Repository: daniellibin/gaiic2021_track3_querySim Branch: master Commit: 08a8079e1ffd Files: 701 Total size: 12.2 MB Directory structure: gitextract_r3nhpke0/ ├── README.md └── code/ ├── .gitignore ├── Config.py ├── Dockerfile ├── NEZHA/ │ ├── configuration_nezha.py │ └── modeling_nezha.py ├── bert-base-chinese/ │ └── config.json ├── bert-base-count3/ │ ├── finetuning/ │ │ ├── .ipynb_checkpoints/ │ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ │ ├── Config.py │ │ ├── NEZHA/ │ │ │ ├── configuration_nezha.py │ │ │ └── modeling_nezha.py │ │ ├── model.py │ │ ├── models/ │ │ │ └── gitkeep │ │ ├── multi_gpu_QA.py │ │ └── utils.py │ └── pretrain/ │ ├── NLP_Utils.py │ ├── __init__.py │ ├── bert_model/ │ │ └── gitkeep │ ├── train_bert.py │ └── transformers1/ │ ├── __init__.py │ ├── __main__.py │ ├── activations.py │ ├── another_try.py │ ├── benchmark/ │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_utils.py │ │ └── benchmark_utils.py │ ├── benchmark_utils.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_electra.py │ ├── configuration_encoder_decoder.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_longformer.py │ ├── configuration_marian.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_reformer.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_graph_to_onnx.py │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ ├── convert_marian_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ └── language_modeling.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ └── processors/ │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file.py │ ├── file_utils.py │ ├── filep.py │ ├── hf_api.py │ ├── hf_argparser.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_beam_search.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_electra.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_longformer.py │ ├── modeling_marian.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_reformer.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_electra.py │ ├── modeling_tf_flaubert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_electra.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_longformer.py │ ├── tokenization_marian.py │ ├── tokenization_openai.py │ ├── tokenization_reformer.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ ├── trainer.py │ ├── trainer_tf.py │ ├── trainer_utils.py │ ├── training_args.py │ ├── training_args_tf.py │ ├── try.py │ └── utils_encoder_decoder.py ├── bert-base-count3-len100/ │ └── finetuning/ │ ├── .ipynb_checkpoints/ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ ├── Config.py │ ├── NEZHA/ │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── model.py │ ├── models/ │ │ └── gitkeep │ ├── multi_gpu_QA.py │ └── utils.py ├── bert-base-count5/ │ ├── finetuning/ │ │ ├── .ipynb_checkpoints/ │ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ │ ├── Config.py │ │ ├── NEZHA/ │ │ │ ├── configuration_nezha.py │ │ │ └── modeling_nezha.py │ │ ├── model.py │ │ ├── models/ │ │ │ └── gitkeep │ │ ├── multi_gpu_QA.py │ │ └── utils.py │ └── pretrain/ │ ├── NLP_Utils.py │ ├── __init__.py │ ├── bert_model/ │ │ └── gitkeep │ ├── train_bert.py │ └── transformers1/ │ ├── __init__.py │ ├── __main__.py │ ├── activations.py │ ├── another_try.py │ ├── benchmark/ │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_utils.py │ │ └── benchmark_utils.py │ ├── benchmark_utils.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_electra.py │ ├── configuration_encoder_decoder.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_longformer.py │ ├── configuration_marian.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_reformer.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_graph_to_onnx.py │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ ├── convert_marian_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ └── language_modeling.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ └── processors/ │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file.py │ ├── file_utils.py │ ├── filep.py │ ├── hf_api.py │ ├── hf_argparser.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_beam_search.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_electra.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_longformer.py │ ├── modeling_marian.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_reformer.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_electra.py │ ├── modeling_tf_flaubert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_electra.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_longformer.py │ ├── tokenization_marian.py │ ├── tokenization_openai.py │ ├── tokenization_reformer.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ ├── trainer.py │ ├── trainer_tf.py │ ├── trainer_utils.py │ ├── training_args.py │ ├── training_args_tf.py │ ├── try.py │ └── utils_encoder_decoder.py ├── bert-base-count5-len32/ │ └── finetuning/ │ ├── .ipynb_checkpoints/ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ ├── Config.py │ ├── NEZHA/ │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── model.py │ ├── models/ │ │ └── gitkeep │ ├── multi_gpu_QA.py │ └── utils.py ├── build_vocab.py ├── docker_build.sh ├── main_fusion_thread.py ├── model.py ├── nezha-base-count3/ │ ├── finetuning/ │ │ ├── .ipynb_checkpoints/ │ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ │ ├── Config.py │ │ ├── NEZHA/ │ │ │ ├── configuration_nezha.py │ │ │ └── modeling_nezha.py │ │ ├── model.py │ │ ├── models/ │ │ │ └── gitkeep │ │ ├── multi_gpu_QA.py │ │ └── utils.py │ └── pretrain/ │ ├── NEZHA/ │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── NLP_Utils.py │ ├── __init__.py │ ├── nezha_model/ │ │ └── gitkeep │ ├── train_nezha.py │ └── transformers1/ │ ├── __init__.py │ ├── __main__.py │ ├── activations.py │ ├── another_try.py │ ├── benchmark/ │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_utils.py │ │ └── benchmark_utils.py │ ├── benchmark_utils.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_electra.py │ ├── configuration_encoder_decoder.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_longformer.py │ ├── configuration_marian.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_reformer.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_graph_to_onnx.py │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ ├── convert_marian_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ └── language_modeling.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ └── processors/ │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file.py │ ├── file_utils.py │ ├── filep.py │ ├── hf_api.py │ ├── hf_argparser.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_beam_search.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_electra.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_longformer.py │ ├── modeling_marian.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_reformer.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_electra.py │ ├── modeling_tf_flaubert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_electra.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_longformer.py │ ├── tokenization_marian.py │ ├── tokenization_openai.py │ ├── tokenization_reformer.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ ├── trainer.py │ ├── trainer_tf.py │ ├── trainer_utils.py │ ├── training_args.py │ ├── training_args_tf.py │ ├── try.py │ └── utils_encoder_decoder.py ├── nezha-base-count5/ │ ├── finetuning/ │ │ ├── .ipynb_checkpoints/ │ │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ │ ├── Config.py │ │ ├── NEZHA/ │ │ │ ├── configuration_nezha.py │ │ │ └── modeling_nezha.py │ │ ├── model.py │ │ ├── models/ │ │ │ └── gitkeep │ │ ├── multi_gpu_QA.py │ │ └── utils.py │ └── pretrain/ │ ├── NEZHA/ │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── NLP_Utils.py │ ├── __init__.py │ ├── nezha_model/ │ │ └── gitkeep │ ├── train_nezha.py │ └── transformers1/ │ ├── __init__.py │ ├── __main__.py │ ├── activations.py │ ├── another_try.py │ ├── benchmark/ │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_utils.py │ │ └── benchmark_utils.py │ ├── benchmark_utils.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_electra.py │ ├── configuration_encoder_decoder.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_longformer.py │ ├── configuration_marian.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_reformer.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_graph_to_onnx.py │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ ├── convert_marian_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ └── language_modeling.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ └── processors/ │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file.py │ ├── file_utils.py │ ├── filep.py │ ├── hf_api.py │ ├── hf_argparser.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_beam_search.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_electra.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_longformer.py │ ├── modeling_marian.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_reformer.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_electra.py │ ├── modeling_tf_flaubert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_electra.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_longformer.py │ ├── tokenization_marian.py │ ├── tokenization_openai.py │ ├── tokenization_reformer.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ ├── trainer.py │ ├── trainer_tf.py │ ├── trainer_utils.py │ ├── training_args.py │ ├── training_args_tf.py │ ├── try.py │ └── utils_encoder_decoder.py ├── nezha-cn-base/ │ ├── config.json │ └── vocab.txt ├── requirements.txt ├── run.sh ├── serial_main_fusion_thread.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # 0.前言 决赛答辩已经过去一段时间了,我们队伍ac milan最终获得了复赛第3,决赛第4的成绩。在此首先感谢一些队友的carry~ 经过2个多月的比赛,学习收获了很多,也认识了很多大佬,在这里记录一下自己的参赛体验和学习收获。 [github地址]: https://github.com/daniellibin/gaiic2021_track3_querySim [比赛地址]: https://tianchi.aliyun.com/competition/entrance/531851/introduction # 1.赛题背景 小布助手是OPPO公司为欧加集团三品牌手机和IoT设备自研的语音助手,为用户提供了有趣、贴心、便捷的对话式服务。意图识别是对话系统中的一个核心任务,而对话短文本语义匹配是意图识别的主流算法方案之一。本赛题要求参赛队伍根据脱敏后的短文本query-pair,预测它们是否属于同一语义,提交的结果按照指定的评价指标使用在线评测数据进行评测和排名,得分最优者获胜。 # 2.赛题描述及数据说明 - ### 训练数据 训练数据包含输入query-pair,以及对应的真值。初赛训练样本10万,复赛训练样本30万,这份数据主要用于参赛队伍训练模型,为确保数据的高质量,每一个样本的真值都有进行人工标注校验。每行为一个训练样本,由query-pair和真值组成,每行格式如下: - query-pair格式:query以中文为主,中间可能带有少量英文单词(如英文缩写、品牌词、设备型号等),采用UTF-8编码,未分词,两个query之间使用\t分割。 - 真值:真值可为0或1,其中1代表query-pair语义相匹配,0则代表不匹配,真值与query-pair之间也用\t分割。 ### 训练数据样本举例(空白间隔为\t): ``` 肖战的粉丝叫什么名字 肖战的粉丝叫什么 1 王者荣耀里面打野谁最厉害 王者荣耀什么英雄最好玩 0 我想换个手机 我要换手机 1 我是张睿 我想张睿 0 不想 不想说 0 ``` ### 测试数据 脱敏后的query-pair数据,初赛采用A/B榜的方式,A榜和B榜样本规模分别为2.5万,发布时间以赛制为准,初赛队伍根据初赛B榜排名择优进入复赛;复赛同样采用A/B榜的方式,样本规模5万(与初赛不重复),复赛队伍根据复赛B榜排名择优进入现场答辩。 ### 测试数据样本举例(空白间隔为\t) ``` 肖战的粉丝叫什么名字 肖战的粉丝叫什么 王者荣耀里面打野谁最厉害 王者荣耀什么英雄最好玩 我想换个手机 我要换手机 我是张睿 我想张睿 不想 不想说 ``` # 3.评估标准 比赛的评估标准由性能标准和效果标准两部分组成,初赛采用效果标准,`AUC` 指标。 # 4.整体设计 ![image-20210619163346172](README.assets/image-20210619163346172.png) ## (1)预训练 #### a.模型选取 赛题所给数据经过了脱敏,相当于一种新的语言,无法直接利用开源的预训练模型进行迁移学习 但是预训练依然很有必要,在有限的数据上,我们需要尽可能充分地利用其中的信息,Bert语言模型的MLM预训练任务可以利用无监督文本信息,学习文本表征、语言学知识和世界性知识 我们选用的是Bert和其变种Nezha,二者主要区别在于绝对位置编码与相对位置编码 考虑到后续的模型融合以及线上环境提供四卡,我们预训练了四个模型,参数量皆为1亿左右 ![image-20210619163530653](README.assets/image-20210619163530653.png) #### b.MASK策略 模型输入为经典的拼接形式:[CLS] s1 [SEP] s2 [SEP] 对偶:s1、s2以50%的概率交换位置,是对语义无损的数据增强方式 长度自适应动态N-gram Mask策略 - 动态Mask:预训练达到400 epoch,上百万次iter,可以每次迭代都随机生成新的mask文本,增强模型泛化能力 - N-gram Mask:以15%的概率选中token,为增加训练难度,选中部分以70%、20%、10%的概率进行1-gram、2-gram、3-gram片段的mask(选中token使用[MASK]、随机词、自身替换的概率和原版Bert一致) - 长度自适应:考虑到对短文本进行过较长gram的mask对语义有较大破坏,长度小于7的文本不进行3-gram mask,小于4的文本不进行2-gram mask - 防止小概率的连续Mask:已经mask了的文本片段,强制跳过下一个token的mask,防止一长串连续的mask #### c.其他Trick与参数设置 - 学习率warmup与衰减 - 预训练400 epoch ,前4.5个epoch,学习率从0线性增长到5e-5,之后线性衰减到1e-5 - 分块shuffle - 预训练周期长,优化时间性能非常重要,分块shuffle将长度差不多的样本组成batch快,块间shuffle,减少padding部分运算量,耗时减少了约40%,实测不会降低模型效果 - 权重衰减 - 限制网络权值的大小,缓解过拟合现象 - 四个模型通用参数设置 ![image-20210619170554408](README.assets/image-20210619170554408.png) ## (2)微调 #### a.模型参数 - 预训练利用文本中的无监督信息,微调则需利用有监督的句子对匹配信息,将赛题任务建模为匹配与不匹配的二分类问题 - 我们在4个预训练模型的基础上,训练了6个微调模型,从词表、截断长度和模型结构等维度保证模型之间的差异性,以便后序模型融合,参数设置对比如下: ![image-20210619170702479](README.assets/image-20210619170702479.png) #### b.后接结构 - Bert/Nezha后接的三种结构 ![image-20210619170927378](README.assets/image-20210619170927378.png) 考虑到Bert已经具备强大的特征提取能力,以及运行和推理时限严格,所以其只后接了一些简单的结构。 #### c.Trick - 学习率 - warmup与衰减:可以使得训练初期学习率较小,模型可以慢慢趋于稳定,待相对稳定后再以预先设置的学习率进行训练,使得模型收敛速度变得更快。后采用学习率衰减的方式使模型收敛到更佳的极值点,提升最终效果 - 不同模型采用不同的学习率(2e-5或4e-5) - 模型融合时先对logits加权平均,后softmax - 使得softmax不再是每个模型独立进行,而是综合利用所有模型信息 - 对抗训练 - 对抗训练是一种引入噪声的训练方式,可以对参数进行正则化,提升模型鲁棒性和泛化能力 Fast Gradient Method (FGM):对embedding层在梯度方向添加扰动 Projected Gradient Descent (PGD) :迭代扰动,每次扰动被投影到规定范围内 团队实验了FGM、PGD,前者速度快且效果更佳。 #### d.通用参数 最佳参数 - batch_size=32,预训练充分的情况下,微调收敛非常快,小bs带来更大的随机性,更不容易过早陷入局部最优 - epoch=3 - dropout=0.2,训练时以一定概率丢弃某些神经元,缓解过拟合 - FGM,epsilon=0.25时效果最佳 ## (3)模型融合与推理 ![image-20210619171224369](README.assets/image-20210619171224369.png) ## (4)性能优化 #### a.分块shuffle - 赛题限制线上总运行时间为80小时,限制推理5w测试集时间为15分钟(含网络开销),性能优化尤为关键 - 分块shuffle将长度差不多的样本组成batch快,块间shuffle,减少padding部分运算量,预训练耗时减少了约40% - 最终预训练线上能控制在9分多钟一个epoch,400个epoch能控制在65小时以内完成 ![image-20210619171438518](README.assets/image-20210619171438518.png) #### b.推理加速 - ONNX Runtime:ONNX Runtime是机器学习模型的预测引擎,能使用内置的图优化(Graph Optimization)和各种硬件加速功能,来优化和加速推理。像BERT这样的Transformer模型,由许多运算符(Operator)的图构成,ONNX Runtime内置图优化功能,可以简化图并且减少节点,进而执行更复杂的节点融合和布局优化。通过使用ONNX Runtime,推理部分获得了非常可观的加速。 ![image-20210619171514789](README.assets/image-20210619171514789.png) #### c.对cuda版本的调优 - 在大家使用较多的cuda11镜像中,我们发现线上V100速度较慢,根据以往项目经验,老一些的卡用较新的cuda版本未必能发挥出最好的性能,我们尝试更换镜像版本为cuda10.2,cudnn版本配套改为7,onnxruntime-gpu版本配套改为1.5.1,推理速度有了较大提升,使得在15分钟内我们能跑6个模型(以往为4个) ![image-20210619171554475](README.assets/image-20210619171554475.png) #### d.其他细节 - 减少内存到显存的通信开销:避免使用.to('cuda')的方式将tensor从内存移至显存,增加通信开销,而是一开始就用torch.tensor(xxx,device='cuda')的方式将tensor创建在显存 - 编写更快的分词函数:所给数据已经用空格将token隔开,避免使用tokenize函数将数据整体当做字符串进行分词,而是按空格split后直接convert_tokens_to_ids - …… # 5.创新和落地 #### a.创新 - 融入对偶的长度自适应动态N-gram Mask策略 - 不同词表、不同截断长度、不同结构的模型融合,保证模型差异性 - 学习率warmup与衰减、模型权重衰减、对抗训练等Trick - 性能优化,包括分块shuffle、ONNX Runtime的使用、对cuda版本的调优和其他细节优化 #### b.落地 - 我们的模型将语义匹配转换为分类问题,这是一种通用性非常强的解决方案,可以广泛落地于自然语言处理领域中涉及到句子关系的各项任务中,如开放域意图识别(本赛题)、QQ匹配、QA匹配、文本蕴含等 - 推理速度较快,不计网络通信消耗,比赛使用的6模(4 Bert,2 Nezha)融合后可达77的QPS(AUC 0.9579),在牺牲不到一个百分点的AUC下,单模Bert可达595的QPS(AUC 0.948) - 实际生产环境复杂,短文本相对容易出现语义缺失,且受噪声影响相对更大(用户输错或语音识别错误几个字,占短文本整体的比例可能就较大),可能需考虑辅以指代消解、文本补全、文本纠错等技术 - 深度学习并非万能,实际落地时,需要不断进行badcase分析,适当辅以规则的方法提升系统鲁棒性 # 6.方案总结 - 总结性回答 - 我们从预训练、微调、模型融合和推理四个方面入手,每个阶段进行针对性的策略改进及创新,辅以性能优化,最终形成了一个较好的端到端解决方案,可以广泛落地于自然语言处理领域中涉及到句子关系的各项任务中,具有较好的实用性和创新性。 - 方法优劣势分析、展望 - 优点:效果好,速度快,模型通用性强 - 缺点:交互型模型因为每次计算都需要输入完整句子对,不适合于从海量文本中召回结果,而是适合在召回小部分候选集后,进行精细的排序 - 展望:从科学研究角度,我们要利用好预训练模型这个核武器,设计更有针对性,更加合理的预训练任务,此外也可探索结合上下文、引入知识的多轮匹配任务。从应用角度,可以从badcase出发,不断优化算法,挖掘用户需求,让小布成为一个知识更加渊博,对话更加流畅,更加人性化的智能助理 # 7.前排大佬解决方案 # 一、AI小花 https://github.com/nilboy/gaic_track3_pair_sim ![](README.assets/image-20210619194942961.png) ![image-20210619194951696](README.assets/image-20210619194951696.png) # 二、[none] ![image-20210619200120075](README.assets/image-20210619200120075.png) ![image-20210619200126641](README.assets/image-20210619200126641.png) ![image-20210619200137247](README.assets/image-20210619200137247.png) # 三、赛道3-白[MASK] ![image-20210619204111156](README.assets/image-20210619204111156.png) ![image-20210619204017855](README.assets/image-20210619204017855.png) ![image-20210619204120283](README.assets/image-20210619204120283.png) ![image-20210619204128548](README.assets/image-20210619204128548.png) # 四、科讯嘉联灵珠团队 ![image-20210619205915821](README.assets/image-20210619205915821.png) ![image-20210619210050654](README.assets/image-20210619210050654.png) # 五、LOL王者 ![image-20210619210251396](README.assets/image-20210619210251396.png) ![image-20210619210301353](README.assets/image-20210619210301353.png) ================================================ FILE: code/.gitignore ================================================ bert-base-chinese/pytorch_model.bin nezha-cn-base/pytorch_model.bin .idea .DS_Store __pycache__ ================================================ FILE: code/Config.py ================================================ from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ RobertaTokenizer, RobertaModel, RobertaConfig from NEZHA.modeling_nezha import NeZhaModel from NEZHA.configuration_nezha import NeZhaConfig MODELS = { 'BertForClass': BertModel, 'BertForClass_MultiDropout': BertModel, 'BertLastTwoCls': BertModel, 'BertLastCls':BertModel, 'BertLastTwoClsPooler': BertModel, 'BertLastTwoEmbeddings': BertModel, 'BertLastTwoEmbeddingsPooler': BertModel, 'BertLastFourCls': BertModel, 'BertLastFourClsPooler': BertModel, 'BertLastFourEmbeddings': BertModel, 'BertLastFourEmbeddingsPooler': BertModel, 'BertDynCls': BertModel, 'BertDynEmbeddings': BertModel, 'BertRNN': BertModel, 'BertCNN': XLNetModel, 'BertRCNN': BertModel, 'XLNet': XLNetModel, 'Electra': ElectraModel, 'NEZHA': NeZhaModel } TOKENIZERS = { 'BertForClass': BertTokenizer, 'BertForClass_MultiDropout': BertTokenizer, 'BertLastTwoCls': BertTokenizer, 'BertLastCls': BertTokenizer, 'BertLastTwoClsPooler': BertTokenizer, 'BertLastTwoEmbeddings': BertTokenizer, 'BertLastTwoEmbeddingsPooler': BertTokenizer, 'BertLastFourCls': BertTokenizer, 'BertLastFourClsPooler': BertTokenizer, 'BertLastFourEmbeddings': BertTokenizer, 'BertLastFourEmbeddingsPooler': BertTokenizer, 'BertDynCls': BertTokenizer, 'BertDynEmbeddings': BertTokenizer, 'BertRNN': BertTokenizer, 'BertCNN': BertTokenizer, 'BertRCNN': BertTokenizer, 'XLNet': XLNetTokenizer, 'Electra': ElectraTokenizer, 'NEZHA': BertTokenizer } CONFIGS = { 'BertForClass': BertConfig, 'BertForClass_MultiDropout': BertConfig, 'BertLastTwoCls': BertConfig, 'BertLastCls': BertConfig, 'BertLastTwoClsPooler': BertConfig, 'BertLastTwoEmbeddings': BertConfig, 'BertLastTwoEmbeddingsPooler': BertConfig, 'BertLastFourCls': BertConfig, 'BertLastFourClsPooler': BertConfig, 'BertLastFourEmbeddings': BertConfig, 'BertLastFourEmbeddingsPooler': BertConfig, 'BertDynCls': BertConfig, 'BertDynEmbeddings': BertConfig, 'BertRNN': BertConfig, 'BertCNN': BertConfig, 'BertRCNN': BertConfig, 'XLNet': XLNetConfig, 'Electra': ElectraConfig, 'NEZHA': NeZhaConfig } ================================================ FILE: code/Dockerfile ================================================ # Base Images ## 从天池基础镜像构建(from的base img 根据自己的需要更换,建议使用天池open list镜像链接:https://tianchi.aliyun.com/forum/postDetail?postId=67720) #FROM registry.cn-shanghai.aliyuncs.com/tcc-public/pytorch:1.6-cuda10.1-py3 FROM registry.cn-shanghai.aliyuncs.com/xiaobu_match/match:cuda10.2base ## 把当前文件夹里的文件构建到镜像的根目录下 ADD . / ##安装依赖包,pip包请在requirements.txt添加 #RUN apt-get update && apt-get install -y curl #RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple #pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers==4.2.0 #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tqdm #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple flask #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pandas #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple psutil #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple onnx #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple onnxruntime-gpu==1.7.0 #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple sklearn #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple onnxruntime_tools #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple sympy #RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple sentencepiece ## 指定默认工作目录为根目录(需要把run.sh和生成的结果文件都放在该文件夹下,提交后才能运行) WORKDIR / ## 镜像启动后统一执行 sh run.sh CMD ["sh", "run.sh"] ================================================ FILE: code/NEZHA/configuration_nezha.py ================================================ from transformers import PretrainedConfig NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class NeZhaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "nezha" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, max_relative_position=64, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, use_relative_position=True, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.max_relative_position = max_relative_position self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_relative_position=use_relative_position self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/NEZHA/modeling_nezha.py ================================================ import math import os import warnings from dataclasses import dataclass from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from transformers.activations import ACT2FN from transformers.file_utils import ( ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, CausalLMOutputWithCrossAttentions, MaskedLMOutput, MultipleChoiceModelOutput, NextSentencePredictorOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from transformers.modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from transformers.models.bert.configuration_bert import BertConfig import logging logger = logging.getLogger(__name__) _CHECKPOINT_FOR_DOC = "bert-base-uncased" _CONFIG_FOR_DOC = "BertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" def load_tf_weights_in_bert(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings def relative_position_encoding(depth, max_length=512, max_relative_position=64): vocab_size = max_relative_position * 2 + 1 range_vec = torch.arange(max_length) range_mat = range_vec.repeat(max_length).view(max_length, max_length) distance_mat = range_mat - torch.t(range_mat) distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) final_mat = distance_mat_clipped + max_relative_position embeddings_table = torch.zeros(vocab_size, depth) position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) embeddings_table[:, 0::2] = torch.sin(position * div_term) embeddings_table[:, 1::2] = torch.cos(position * div_term) embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) flat_relative_positions_matrix = final_mat.view(-1) one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, num_classes=vocab_size).float() positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) return positions_encoding class BertSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False, relations_kv=None ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None if is_cross_attention and past_key_value is not None: # reuse k,v, cross_attentions key_layer = past_key_value[0] value_layer = past_key_value[1] attention_mask = encoder_attention_mask elif is_cross_attention: key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = torch.cat([past_key_value[0], key_layer], dim=2) value_layer = torch.cat([past_key_value[1], value_layer], dim=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) query_layer = self.transpose_for_scores(mixed_query_layer) if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of # all previous decoder key/value_states. Further calls to uni-directional self-attention # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() query_layer_t = query_layer.permute(2, 0, 1, 3) query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, self.attention_head_size) key_position_scores = torch.matmul(query_layer_r, relations_kv.permute(0, 2, 1)) key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, num_attention_heads, from_seq_length) key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in NeZhaModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) attention_probs_t = attention_probs.permute(2, 0, 1, 3) attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, to_seq_length) value_position_scores = torch.matmul(attentions_probs_r, relations_kv) value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, num_attention_heads, self.attention_head_size) value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) context_layer = context_layer + value_position_scores_r_t context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) if self.is_decoder: outputs = outputs + (past_key_value,) return outputs class BertSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertAttention(nn.Module): def __init__(self, config): super().__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices( heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads ) # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False, relations_kv=None ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions, relations_kv=relations_kv ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class BertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states class BertOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.attention = BertAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" self.crossattention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False, relations_kv=None ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, past_key_value=self_attn_past_key_value, relations_kv=relations_kv ) attention_output = self_attention_outputs[0] # if decoder, the last output is tuple of self-attn cache if self.is_decoder: outputs = self_attention_outputs[1:-1] present_key_value = self_attention_outputs[-1] else: outputs = self_attention_outputs[1:] # add self attentions if we output attention weights cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: assert hasattr( self, "crossattention" ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, cross_attn_past_key_value, output_attentions, ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights # add cross-attn cache to positions 3,4 of present_key_value tuple cross_attn_present_key_value = cross_attention_outputs[-1] present_key_value = present_key_value + cross_attn_present_key_value layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output ) outputs = (layer_output,) + outputs # if decoder, return the attn key/values as the last output if self.is_decoder: outputs = outputs + (present_key_value,) return outputs def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) return layer_output class NeZhaEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, depth=int(config.hidden_size / config.num_attention_heads), max_relative_position=config.max_relative_position).to('cuda') def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=False, ): to_seq_length=hidden_states.shape[1] relations_kv = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None next_decoder_cache = () if use_cache else None for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None past_key_value = past_key_values[i] if past_key_values is not None else None if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) use_cache = False def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, past_key_value, output_attentions) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, ) else: layer_outputs = layer_module( hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions,relations_kv=relations_kv ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[-1],) if output_attentions: all_self_attentions = all_self_attentions + (layer_outputs[1],) if self.config.add_cross_attention: all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: return tuple( v for v in [ hidden_states, next_decoder_cache, all_hidden_states, all_self_attentions, all_cross_attentions, ] if v is not None ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, cross_attentions=all_cross_attentions, ) class BertPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class BertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() self.transform = BertPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) return hidden_states class BertOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) def forward(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores class BertOnlyNSPHead(nn.Module): def __init__(self, config): super().__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score class BertPreTrainingHeads(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, sequence_output, pooled_output): prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class BertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = BertConfig load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @dataclass class BertForPreTrainingOutput(ModelOutput): """ Output type of :class:`~transformers.BertForPreTraining`. Args: loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[torch.FloatTensor] = None prediction_logits: torch.FloatTensor = None seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None BERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.BertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: - 0 corresponds to a `sentence A` token, - 1 corresponds to a `sentence B` token. `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. """ @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class NeZhaModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an input to the forward pass. """ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) self.encoder = NeZhaEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() batch_size, seq_length = input_shape elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size, seq_length = input_shape else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if attention_mask is None: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) @add_start_docstrings( """ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): Used to hide legacy arguments that have been deprecated. Returns: Example:: >>> from transformers import BertTokenizer, BertForPreTraining >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output return BertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [ r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) if not config.is_decoder: logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") self.bert = NeZhaModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). Returns: Example:: >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") >>> config.is_decoder = True >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) lm_loss = None if labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() labels = labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss() lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # cut decoder_input_ids if past is used if past is not None: input_ids = input_ids[:, -1:] return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class NeZhaForMaskedLM(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) if config.is_decoder: logger.warning( "If you want to use `NeZhaForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." ) self.bert = NeZhaModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # add a dummy token assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class BertForNextSentencePrediction(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=False, **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. Returns: Example:: >>> from transformers import BertTokenizer, BertForNextSentencePrediction >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') >>> outputs = model(**encoding, labels=torch.LongTensor([1])) >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ if "next_sentence_label" in kwargs: warnings.warn( "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.", FutureWarning, ) labels = kwargs.pop("next_sentence_label") return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] seq_relationship_scores = self.cls(pooled_output) next_sentence_loss = None if labels is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) if not return_dict: output = (seq_relationship_scores,) + outputs[2:] return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class BertForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class BertForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See :obj:`input_ids` above) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config, add_pooling_layer=False) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=False, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: code/bert-base-chinese/config.json ================================================ { "architectures": [ "BertForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "directionality": "bidi", "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "pooler_fc_size": 768, "pooler_num_attention_heads": 12, "pooler_num_fc_layers": 3, "pooler_size_per_head": 128, "pooler_type": "first_token_transform", "type_vocab_size": 2, "vocab_size": 21128 } ================================================ FILE: code/bert-base-count3/finetuning/.ipynb_checkpoints/PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved. \n", "Licensed under the MIT License." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference PyTorch Bert Model with ONNX Runtime on GPU" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, you'll learn how to load a Bert model from PyTorch, convert it to ONNX, and inference it for high performance using ONNX Runtime and NVIDIA GPU. In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text from the corresponding reading passage, or the question might be unanswerable.\n", "\n", "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Prerequisites ##\n", "It requires your machine to have a GPU, and a python environment with [PyTorch](https://pytorch.org/) installed before running this notebook.\n", "\n", "#### GPU Environment Setup using AnaConda\n", "\n", "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", "\n", "```console\n", "conda create -n gpu_env python=3.7\n", "conda activate gpu_env\n", "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n", "conda install -c anaconda ipykernel\n", "conda install -c conda-forge ipywidgets\n", "python -m ipykernel install --user --name=gpu_env_py37\n", "jupyter notebook\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n", "\n", "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Skipping onnxruntime-gpu as it is not installed.\u001b[0m\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet --upgrade transformers\n", "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n", "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", "!{sys.executable} -m pip install --quiet wget netron pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Load Pretrained Bert model ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We begin by downloading the SQuAD data file and store them in the specified location. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "cache_dir = \"./squad\"\n", "if not os.path.exists(cache_dir):\n", " os.makedirs(cache_dir)\n", "\n", "predict_file_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json\"\n", "predict_file = os.path.join(cache_dir, \"dev-v1.1.json\")\n", "if not os.path.exists(predict_file):\n", " import wget\n", " print(\"Start downloading predict file.\")\n", " wget.download(predict_file_url, predict_file)\n", " print(\"Predict file downloaded.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first define some constant variables." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Whether allow overwriting existing ONNX model and download the latest script from GitHub\n", "enable_overwrite = True\n", "\n", "# Total samples to inference, so that we can get average latency\n", "total_samples = 1000\n", "\n", "# ONNX opset version\n", "opset_version=11" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Specify some model configuration variables." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n", "model_name_or_path = \"bert-base-cased\"\n", "max_seq_length = 128\n", "doc_stride = 128\n", "max_query_length = 64" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Start to load model from pretrained. This step could take a few minutes. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n", "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n", "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n" ] } ], "source": [ "# The following code is adapted from HuggingFace transformers\n", "# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n", "\n", "from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "\n", "# Load pretrained model and tokenizer\n", "config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n", "tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", "model = model_class.from_pretrained(model_name_or_path,\n", " from_tf=False,\n", " config=config,\n", " cache_dir=cache_dir)\n", "# load some examples\n", "from transformers.data.processors.squad import SquadV1Processor\n", "\n", "processor = SquadV1Processor()\n", "examples = processor.get_dev_examples(None, filename=predict_file)\n", "\n", "from transformers import squad_convert_examples_to_features\n", "features, dataset = squad_convert_examples_to_features( \n", " examples=examples[:total_samples], # convert enough examples for this notebook\n", " tokenizer=tokenizer,\n", " max_seq_length=max_seq_length,\n", " doc_stride=doc_stride,\n", " max_query_length=max_query_length,\n", " is_training=False,\n", " return_dataset='pt'\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Export the loaded model ##\n", "Once the model is loaded, we can export the loaded PyTorch model to ONNX." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model exported at ./onnx/bert-base-cased-squad_opset11.onnx\n" ] } ], "source": [ "output_dir = \"./onnx\"\n", "if not os.path.exists(output_dir):\n", " os.makedirs(output_dir) \n", "export_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opset{}.onnx'.format(opset_version))\n", "\n", "import torch\n", "use_gpu = torch.cuda.is_available()\n", "device = torch.device(\"cuda\" if use_gpu else \"cpu\")\n", "\n", "# Get the first example data to run the model and export it to ONNX\n", "data = dataset[0]\n", "inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", "}\n", "\n", "# Set model to inference mode, which is required before exporting the model because some operators behave differently in \n", "# inference and training mode.\n", "model.eval()\n", "model.to(device)\n", "\n", "if enable_overwrite or not os.path.exists(export_model_path):\n", " with torch.no_grad():\n", " symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}\n", " torch.onnx.export(model, # model being run\n", " args=tuple(inputs.values()), # model input (or a tuple for multiple inputs)\n", " f=export_model_path, # where to save the model (can be a file or file-like object)\n", " opset_version=opset_version, # the ONNX version to export the model to\n", " do_constant_folding=True, # whether to execute constant folding for optimization\n", " input_names=['input_ids', # the model's input names\n", " 'input_mask', \n", " 'segment_ids'],\n", " output_names=['start', 'end'], # the model's output names\n", " dynamic_axes={'input_ids': symbolic_names, # variable length axes\n", " 'input_mask' : symbolic_names,\n", " 'segment_ids' : symbolic_names,\n", " 'start' : symbolic_names,\n", " 'end' : symbolic_names})\n", " print(\"Model exported at \", export_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. PyTorch Inference ##\n", "Use PyTorch to evaluate an example input for comparison purpose." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PyTorch cuda Inference time = 16.57 ms\n" ] } ], "source": [ "import time\n", "\n", "# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.\n", "latency = []\n", "with torch.no_grad():\n", " for i in range(total_samples):\n", " data = dataset[i]\n", " inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", " }\n", " start = time.time()\n", " outputs = model(**inputs)\n", " latency.append(time.time() - start)\n", "print(\"PyTorch {} Inference time = {} ms\".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", "### CUDA and cuDNN Path\n", "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n", "\n", "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "\n", "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", "add_cuda_path = False\n", "\n", "if add_cuda_path:\n", " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n", " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n", " else:\n", " if cuda_dir == cudnn_dir:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", " else:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OpenMP Environment Variable\n", "\n", "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n", "\n", "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", "\n", "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Optional. You can change them according to Performance Test Tool result.\n", "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n", "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to inference the model with ONNX Runtime." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OnnxRuntime gpu Inference time = 4.43 ms\n" ] } ], "source": [ "import psutil\n", "import onnxruntime\n", "import numpy\n", "\n", "assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()\n", "device_name = 'gpu'\n", "\n", "sess_options = onnxruntime.SessionOptions()\n", "\n", "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n", "# Note that this will increase session creation time so enable it for debugging only.\n", "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_{}.onnx\".format(device_name))\n", "\n", "# Please change the value according to best setting in Performance Test Tool result.\n", "sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n", "\n", "session = onnxruntime.InferenceSession(export_model_path, sess_options)\n", "\n", "latency = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n", " ort_inputs = {\n", " 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n", " 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n", " 'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n", " }\n", " start = time.time()\n", " ort_outputs = session.run(None, ort_inputs)\n", " latency.append(time.time() - start)\n", " \n", "print(\"OnnxRuntime {} Inference time = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can compare the output of PyTorch and ONNX Runtime. We can see some results are not close. It is because ONNX Runtime uses some approximation in CUDA optimization. Based on our evaluation on SQuAD data set, F1 score is on par for models before and after optimization." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Verifying correctness *****\n", "PyTorch and ONNX Runtime output 0 are close: True\n", "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n", "PyTorch and ONNX Runtime output 1 are close: True\n", "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n" ] } ], "source": [ "print(\"***** Verifying correctness *****\")\n", "for i in range(2): \n", " print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-02, atol=1e-02))\n", " diff = ort_outputs[i] - outputs[i].cpu().numpy()\n", " max_diff = numpy.max(numpy.abs(diff))\n", " avg_diff = numpy.average(numpy.abs(diff))\n", " print(f'maximum_diff={max_diff} average_diff={avg_diff}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference with Actual Sequence Length\n", "Note that ONNX model is exported using dynamic length axis. It is recommended to use actual sequence input without padding instead of fixed length input for best performance. Let's see how it can be applied to this model.\n", "\n", "From an example input below, we can see zero padding at the end of each sequence." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n", " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n", " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n", " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n", " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n", " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n", " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]],\n", " device='cuda:0'),\n", " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n", " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# An example input (we can see padding). From attention_mask, we can deduce the actual length.\n", "inputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original sequence length is 128. After removing paddings, the sequence length is reduced. Input with smaller sequence length need less computation, thus we can see there is improvement on inference latency. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average length 101\n", "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n" ] } ], "source": [ "import statistics\n", "\n", "latency = []\n", "lengths = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # Instead of using fixed length (128), we can use actual sequence length (less than 128), which helps to get better performance.\n", " actual_sequence_length = sum(data[1].numpy())\n", " lengths.append(actual_sequence_length)\n", " opt_inputs = {\n", " 'input_ids': data[0].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'input_mask': data[1].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'segment_ids': data[2].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length)\n", " }\n", " start = time.time()\n", " opt_outputs = session.run(None, opt_inputs)\n", " latency.append(time.time() - start)\n", "print(\"Average length\", statistics.mean(lengths))\n", "print(\"OnnxRuntime {} Inference time with actual sequence length = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's compare the output and see whether the results are close.\n", "\n", "**Note**: Need end-to-end evaluation on performance and accuracy if you use this strategy." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Comparing results with/without paddings *****\n", "Output 0 are close: True\n", "Output 1 are close: True\n" ] } ], "source": [ "print(\"***** Comparing results with/without paddings *****\")\n", "for i in range(2):\n", " print('Output {} are close:'.format(i), numpy.allclose(opt_outputs[i], ort_outputs[i][:,:len(opt_outputs[i][0])], rtol=1e-03, atol=1e-03))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Offline Optimization and Test Tools\n", "\n", "It is recommended to try [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It could help verify whether the model can be fully optimized, and get performance test results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Transformer Optimizer\n", "\n", "Although OnnxRuntime could optimize Bert model exported by PyTorch. Sometime, model cannot be fully optimized due to different reasons:\n", "* A new subgraph pattern is generated by new version of export tool, and the pattern is not covered by older version of OnnxRuntime. \n", "* The exported model uses dynamic axis and this makes it harder for shape inference of the graph. That blocks some optimization to be applied.\n", "* Some optimization is better to be done offline. Like change input tensor type from int64 to int32 to avoid extra Cast nodes, or convert model to float16 to achieve better performance in V100 or T4 GPU.\n", "\n", "We have python script **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n", "\n", "In this example, we can see that it introduces optimization that is not provided by onnxruntime: SkipLayerNormalization and bias fusion, which is not fused in OnnxRuntime due to shape inference as mentioned.\n", "\n", "It will also tell whether the model is fully optimized or not. If not, that means you might need change the script to fuse some new pattern of subgraph.\n", "\n", "Example Usage:\n", "```\n", "from onnxruntime_tools import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", "You can also use optimizer_cli like the following:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Float32 Model\n", "Let us optimize the ONNX model using the script. The first example will output model with float32 to store weights. This is the choice for most GPUs without Tensor Core.\n", "\n", "If your GPU (like V100 or T4) has Tensor Core, jump to [Float16 Model](#6.-Model-Optimization-with-Float16) section since that will give you better performance than Float32 model." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n", "\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Optimized Graph\n", "We can open the optimized model using [Netron](https://github.com/lutzroeder/netron) to visualize.\n", "\n", "The graph is like the following:\n", "\n", "\n", "Sometime, optimized graph is slightly different. For example, FastGelu is replaced by BiasGelu for CPU inference; When the option --input_int32 is used, Cast nodes for inputs are removed." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import netron\n", "\n", "# change it to True if want to view the optimized model in browser\n", "enable_netron = False\n", "if enable_netron:\n", " # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n", " netron.start(optimized_fp32_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance Test Tool\n", "\n", "The following will create 1000 random inputs of batch_size 1 and sequence length 128, then measure the average latency and throughput numbers.\n", "\n", "Note that the test uses fixed sequence length. If you use [dynamic sequence length](#Inference-with-Actual-Sequence-Length), actual performance depends on the distribution of sequence length.\n", "\n", "**Attention**: Latency numbers from Jupyter Notebook are not accurate. See [Attional Info](#7.-Additional-Info) for more info." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.92 ms, Throughput = 203.24 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.90 ms, Throughput = 203.88 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 5.07 ms, Throughput = 197.16 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.82 ms, Throughput = 207.33 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.93 ms, Throughput = 202.92 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.91 ms, Throughput = 203.55 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.88 ms, Throughput = 204.90 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's load the summary file and take a look. Note that blank value in OMP_NUM_THREADS or OMP_WAIT_POLICY means the environment variable does not exist." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
04.824.534.575.157.258.75207.33112ACTIVENoneTrue
14.884.544.586.477.138.68204.901212PASSIVENoneTrue
24.904.544.576.167.648.82203.88112PASSIVENoneTrue
34.914.554.596.707.438.78203.551212ACTIVENoneTrue
44.924.574.606.507.828.90203.240NoneTrue
54.934.554.596.667.578.80202.92121PASSIVENoneTrue
65.074.564.617.198.119.01197.16121ACTIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 4.82 4.53 4.57 5.15 7.25 \n", "1 4.88 4.54 4.58 6.47 7.13 \n", "2 4.90 4.54 4.57 6.16 7.64 \n", "3 4.91 4.55 4.59 6.70 7.43 \n", "4 4.92 4.57 4.60 6.50 7.82 \n", "5 4.93 4.55 4.59 6.66 7.57 \n", "6 5.07 4.56 4.61 7.19 8.11 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 8.75 207.33 1 12 \n", "1 8.68 204.90 12 12 \n", "2 8.82 203.88 1 12 \n", "3 8.78 203.55 12 12 \n", "4 8.90 203.24 0 \n", "5 8.80 202.92 12 1 \n", "6 9.01 197.16 12 1 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 PASSIVE None True \n", "2 PASSIVE None True \n", "3 ACTIVE None True \n", "4 None True \n", "5 PASSIVE None True \n", "6 ACTIVE None True " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", "\n", "### Model Results Comparison Tool\n", "\n", "When a BERT model is optimized, some approximation is used in calculation. If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare the inference outputs of the original and optimized models. If outputs are all close, it is safe to use the optimized model.\n", "\n", "For GPU inference, the absolute or relative difference is larger than those numbers of CPU inference. Note that slight difference in output will not impact final result. We did end-to-end evaluation using SQuAD data set using a fine-tuned squad model, and F1 score is almost the same before/after optimization." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", "maximum absolute difference=1.9222497940063477e-06\r\n", "maximum relative difference=0.05027933046221733\r\n" ] } ], "source": [ "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Model Optimization with Float16\n", "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.90 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.12 ms, Throughput = 320.00 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.02 ms, Throughput = 331.39 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 332.53 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 328.67 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.72 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 329.32 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
03.012.792.812.865.087.16332.53112ACTIVENoneTrue
13.012.802.812.884.527.05331.900NoneTrue
23.012.782.802.925.017.02331.721212ACTIVENoneTrue
33.022.792.802.856.347.04331.39121ACTIVENoneTrue
43.042.802.822.935.567.08329.321212PASSIVENoneTrue
53.042.792.812.926.377.08328.67121PASSIVENoneTrue
63.122.792.822.966.667.20320.00112PASSIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.01 2.79 2.81 2.86 5.08 \n", "1 3.01 2.80 2.81 2.88 4.52 \n", "2 3.01 2.78 2.80 2.92 5.01 \n", "3 3.02 2.79 2.80 2.85 6.34 \n", "4 3.04 2.80 2.82 2.93 5.56 \n", "5 3.04 2.79 2.81 2.92 6.37 \n", "6 3.12 2.79 2.82 2.96 6.66 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 7.16 332.53 1 12 \n", "1 7.05 331.90 0 \n", "2 7.02 331.72 12 12 \n", "3 7.04 331.39 12 1 \n", "4 7.08 329.32 12 12 \n", "5 7.08 328.67 12 1 \n", "6 7.20 320.00 1 12 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 None True \n", "2 ACTIVE None True \n", "3 ACTIVE None True \n", "4 PASSIVE None True \n", "5 PASSIVE None True \n", "6 PASSIVE None True " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Throughput Tuning\n", "\n", "Some application need best throughput under some constraint on latency. This can be done by testing performance of different batch sizes. The tool could help on this.\n", "\n", "Here is an example that check the performance of multiple batch sizes (1, 2, 4, 8, 16, 32 and 64) using default settings." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n", "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.00 ms, Throughput = 333.83 QPS\n", "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.59 ms, Throughput = 557.32 QPS\n", "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=64 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n", "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=4 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.32 ms, Throughput = 926.92 QPS\n", "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n", "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)batch_size
03.002.792.812.864.377.08333.831
13.593.333.353.426.607.54557.322
24.323.984.014.647.238.11926.924
36.325.945.977.618.9610.121266.638
49.609.229.2511.3212.3313.341666.0516
516.1715.8015.9017.3818.8019.931979.4132
629.2628.8929.0130.6332.5333.282187.1564
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.00 2.79 2.81 2.86 4.37 \n", "1 3.59 3.33 3.35 3.42 6.60 \n", "2 4.32 3.98 4.01 4.64 7.23 \n", "3 6.32 5.94 5.97 7.61 8.96 \n", "4 9.60 9.22 9.25 11.32 12.33 \n", "5 16.17 15.80 15.90 17.38 18.80 \n", "6 29.26 28.89 29.01 30.63 32.53 \n", "\n", " Latency_P99 Throughput(QPS) batch_size \n", "0 7.08 333.83 1 \n", "1 7.54 557.32 2 \n", "2 8.11 926.92 4 \n", "3 10.12 1266.63 8 \n", "4 13.34 1666.05 16 \n", "5 19.93 1979.41 32 \n", "6 33.28 2187.15 64 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float16 model summary from\", latest_result_file)\n", "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n", "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Additional Info\n", "\n", "Note that running Jupyter Notebook has significant impact on performance result. You can close Jupyter Notebook and other applications, then run the performance test in a console to get more accurate performance numbers.\n", "\n", "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it measure inference speed of OnnxRuntime.\n", "\n", "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n", "\n", "Here is the machine configuration that generated the above results. You might get slower or faster result according to your hardware." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\r\n", " \"gpu\": {\r\n", " \"driver_version\": \"440.64.00\",\r\n", " \"devices\": [\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 14110883840,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " },\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 16932601856,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " }\r\n", " ]\r\n", " },\r\n", " \"cpu\": {\r\n", " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n", " \"cores\": 12,\r\n", " \"logical_cores\": 12,\r\n", " \"hz\": \"2.5940 GHz\",\r\n", " \"l2_cache\": \"256 KB\",\r\n", " \"l3_cache\": \"35840 KB\",\r\n", " \"processor\": \"x86_64\"\r\n", " },\r\n", " \"memory\": {\r\n", " \"total\": 236645588992,\r\n", " \"available\": 222567559168\r\n", " },\r\n", " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n", " \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n", " \"onnxruntime\": {\r\n", " \"version\": \"1.3.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"pytorch\": {\r\n", " \"version\": \"1.5.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"tensorflow\": null\r\n", "}\r\n" ] } ], "source": [ "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" ] } ], "metadata": { "kernelspec": { "display_name": "PyCharm (ccks_ner-master)", "language": "python", "name": "pycharm-de4c0941" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: code/bert-base-count3/finetuning/Config.py ================================================ from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ RobertaTokenizer, RobertaModel, RobertaConfig from NEZHA.modeling_nezha import NeZhaModel from NEZHA.configuration_nezha import NeZhaConfig MODELS = { 'BertForClass': BertModel, 'BertForClass_MultiDropout': BertModel, 'BertLastTwoCls': BertModel, 'BertLastCls':BertModel, 'BertLastTwoClsPooler': BertModel, 'BertLastTwoEmbeddings': BertModel, 'BertLastTwoEmbeddingsPooler': BertModel, 'BertLastFourCls': BertModel, 'BertLastFourClsPooler': BertModel, 'BertLastFourEmbeddings': BertModel, 'BertLastFourEmbeddingsPooler': BertModel, 'BertDynCls': BertModel, 'BertDynEmbeddings': BertModel, 'BertRNN': BertModel, 'BertCNN': XLNetModel, 'BertRCNN': BertModel, 'XLNet': XLNetModel, 'Electra': ElectraModel, 'NEZHA': NeZhaModel } TOKENIZERS = { 'BertForClass': BertTokenizer, 'BertForClass_MultiDropout': BertTokenizer, 'BertLastTwoCls': BertTokenizer, 'BertLastCls': BertTokenizer, 'BertLastTwoClsPooler': BertTokenizer, 'BertLastTwoEmbeddings': BertTokenizer, 'BertLastTwoEmbeddingsPooler': BertTokenizer, 'BertLastFourCls': BertTokenizer, 'BertLastFourClsPooler': BertTokenizer, 'BertLastFourEmbeddings': BertTokenizer, 'BertLastFourEmbeddingsPooler': BertTokenizer, 'BertDynCls': BertTokenizer, 'BertDynEmbeddings': BertTokenizer, 'BertRNN': BertTokenizer, 'BertCNN': BertTokenizer, 'BertRCNN': BertTokenizer, 'XLNet': XLNetTokenizer, 'Electra': ElectraTokenizer, 'NEZHA': BertTokenizer } CONFIGS = { 'BertForClass': BertConfig, 'BertForClass_MultiDropout': BertConfig, 'BertLastTwoCls': BertConfig, 'BertLastCls': BertConfig, 'BertLastTwoClsPooler': BertConfig, 'BertLastTwoEmbeddings': BertConfig, 'BertLastTwoEmbeddingsPooler': BertConfig, 'BertLastFourCls': BertConfig, 'BertLastFourClsPooler': BertConfig, 'BertLastFourEmbeddings': BertConfig, 'BertLastFourEmbeddingsPooler': BertConfig, 'BertDynCls': BertConfig, 'BertDynEmbeddings': BertConfig, 'BertRNN': BertConfig, 'BertCNN': BertConfig, 'BertRCNN': BertConfig, 'XLNet': XLNetConfig, 'Electra': ElectraConfig, 'NEZHA': NeZhaConfig } ================================================ FILE: code/bert-base-count3/finetuning/NEZHA/configuration_nezha.py ================================================ from transformers import PretrainedConfig NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class NeZhaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "nezha" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, max_relative_position=64, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, use_relative_position=True, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.max_relative_position = max_relative_position self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_relative_position=use_relative_position self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count3/finetuning/NEZHA/modeling_nezha.py ================================================ import math import os import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_nezha import NeZhaConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_utils import PreTrainedModel, prune_linear_layer from transformers.models.bert.modeling_bert import ( BertOutput, BertPooler, BertSelfOutput, BertIntermediate, BertOnlyMLMHead, BertOnlyNSPHead, BertPreTrainingHeads, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING, ) logger = logging.getLogger(__name__) _CONFIG_FOR_DOC = "NeZhaConfig" _TOKENIZER_FOR_DOC = "NeZhaTokenizer" NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: # logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step", "good_steps", "loss_scale", 'bad_steps'] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class NeZhaEmbeddings(nn.Module): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.use_relative_position = config.use_relative_position self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] device = input_ids.device if input_ids is not None else inputs_embeds.device if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings def relative_position_encoding(depth, max_length=512, max_relative_position=127): vocab_size = max_relative_position * 2 + 1 range_vec = torch.arange(max_length) range_mat = range_vec.repeat(max_length).view(max_length, max_length) distance_mat = range_mat - torch.t(range_mat) distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) final_mat = distance_mat_clipped + max_relative_position embeddings_table = torch.zeros(vocab_size, depth) position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) embeddings_table[:, 0::2] = torch.sin(position * div_term) embeddings_table[:, 1::2] = torch.cos(position * div_term) embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) flat_relative_positions_matrix = final_mat.view(-1) one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, num_classes=vocab_size).float() positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) return positions_encoding class NeZhaSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, depth=self.attention_head_size, max_relative_position=config.max_relative_position).to('cuda') def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] query_layer_t = query_layer.permute(2, 0, 1, 3) query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, self.attention_head_size) key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, num_attention_heads, from_seq_length) key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] attention_probs_t = attention_probs.permute(2, 0, 1, 3) attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, to_seq_length) value_position_scores = torch.matmul(attentions_probs_r, relations_values) value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, num_attention_heads, self.attention_head_size) value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) context_layer = context_layer + value_position_scores_r_t context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class NeZhaAttention(nn.Module): def __init__(self, config): super().__init__() self.self = NeZhaSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class NeZhaLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = NeZhaAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = NeZhaAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class NeZhaEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class NeZhaPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = NeZhaConfig pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class NeZhaModel(NeZhaPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = NeZhaEmbeddings(config) self.encoder = NeZhaEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( attention_mask, input_shape, self.device ) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class NeZhaForPreTraining(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) # add hidden states and attention if they are here outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class NeZhaForMaskedLM(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. masked_lm_labels = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class NeZhaForSequenceClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class NeZhaForMultipleChoice(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class NeZhaForTokenClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, position_ids=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/finetuning/model.py ================================================ import torch import random import os from torch import nn, optim import torch.nn.functional as F from transformers.activations import get_activation from Config import * class BertForClass(nn.Module): def __init__(self, config): super(BertForClass, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit class BertForClass_MultiDropout(nn.Module): def __init__(self, config): super(BertForClass_MultiDropout, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.multi_drop = 5 self.multi_dropouts = nn.ModuleList([nn.Dropout(config.dropout) for _ in range(self.multi_drop)]) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) for j, dropout in enumerate(self.multi_dropouts): if j == 0: logit = self.classifier(dropout(concat_out)) / self.multi_drop else: logit += self.classifier(dropout(concat_out)) / self.multi_drop return logit class BertLastTwoCls(nn.Module): def __init__(self, config): super(BertLastTwoCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) logit = self.classifier(pooler_output) return logit class BertLastCls(nn.Module): def __init__(self, config): super(BertLastCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] if self.isDropout: output = self.dropout(pooler_output) logit = self.classifier(output) return logit class BertLastTwoClsPooler(nn.Module): def __init__(self, config): super(BertLastTwoClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddings(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourCls(nn.Module): def __init__(self, config): super(BertLastFourCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] output = torch.cat( (hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourClsPooler(nn.Module): def __init__(self, config): super(BertLastFourClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddings(nn.Module): def __init__(self, config): super(BertLastFourEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastFourEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertDynCls(nn.Module): def __init__(self, config): super(BertDynCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = hidden_states[-(i + 1)][0] weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertDynEmbeddings(nn.Module): def __init__(self, config): super(BertDynEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = torch.mean(hidden_states[-(i + 1)], dim=1) weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertRNN(nn.Module): def __init__(self, config): super(BertRNN, self).__init__() self.rnn_type = "gru" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.1 self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.num_directions = 1 if not self.bidirectional else 2 if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) self.dropout = nn.Dropout(self.drop_out) self.fc_rnn = nn.Linear(self.hidden_dim * self.num_directions, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(sequence_output) else: output, (hidden, cell) = self.rnn(sequence_output) # output = [ batch size, sent len, hidden_dim * bidirectional] batch_size, max_seq_len, hidden_dim = output.shape hidden = torch.transpose(hidden, 1, 0) hidden = torch.mean(torch.reshape(hidden, [batch_size, -1, hidden_dim]), dim=1) output = torch.sum(output, dim=1) fc_input = self.dropout(output + hidden) # output = torch.mean(output, dim=1) # fc_input = self.dropout(output) out = self.fc_rnn(fc_input) return out class BertCNN(nn.Module): def __init__(self, config): super(BertCNN, self).__init__() self.num_filters = 100 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.hidden_size = self.bert_config.to_dict()['hidden_size'] self.filter_sizes = {3, 4, 5} self.drop_out = 0.5 self.convs = nn.ModuleList( [nn.Conv2d(1, self.num_filters, (k, self.hidden_size)) for k in self.filter_sizes]) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dropout = nn.Dropout(self.drop_out) self.fc_cnn = nn.Linear(self.num_filters * len(self.filter_sizes), config.num_class) def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) x = F.max_pool1d(x, x.size(2)).squeeze(2) return x def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = self.dropout(sequence_output) out = sequence_output.unsqueeze(1) out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) out = self.dropout(out) out = self.fc_cnn(out) return out class BertRCNN(nn.Module): def __init__(self, config): super(BertRCNN, self).__init__() self.rnn_type = "lstm" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.5 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) # self.maxpool = nn.MaxPool1d() self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.fc = nn.Linear(self.hidden_dim * self.n_layers, config.num_class) self.dropout = nn.Dropout(self.drop_out) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sentence_len = sequence_output.shape[1] pooler_output = pooler_output.unsqueeze(dim=1).repeat(1, sentence_len, 1) bert_sentence = sequence_output + pooler_output self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(bert_sentence) else: output, (hidden, cell) = self.rnn(bert_sentence) batch_size, max_seq_len, hidden_dim = output.shape out = torch.transpose(output.relu(), 1, 2) out = F.max_pool1d(out, max_seq_len).squeeze() out = self.fc(out) return out class XLNet(nn.Module): def __init__(self, config): super(XLNet, self).__init__() self.xlnet = XLNetModel.from_pretrained(config.model_path) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.fc = nn.Linear(self.xlnet.d_model, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output = self.xlnet(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = torch.sum(sequence_output[0], dim=1) if self.isDropout: sequence_output = self.dropout(sequence_output) out = self.fc(sequence_output) return out class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x class Electra(nn.Module): def __init__(self, config): super(Electra, self).__init__() self.electra = ElectraModel.from_pretrained(config.model_path) config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.electra_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) self.electra_config.num_labels = config.num_class self.fc = ElectraClassificationHead(self.electra_config) def forward(self, input_ids, input_masks, segment_ids): discriminator_hidden_states = self.electra(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = discriminator_hidden_states[0] out = self.fc(sequence_output) return out class NEZHA(nn.Module): def __init__(self, config): super(NEZHA, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) #self.bert_model = MODELS[config.model](config=self.bert_config) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) # NEZHA init #torch_init_model(self.bert_model, os.path.join(config.model_path, 'pytorch_model.bin')) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit ================================================ FILE: code/bert-base-count3/finetuning/models/gitkeep ================================================ ================================================ FILE: code/bert-base-count3/finetuning/multi_gpu_QA.py ================================================ from tqdm import tqdm, trange import numpy as np import pandas as pd import logging import torch import random import os from torch import nn, optim from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig from transformers.optimization import get_linear_schedule_with_warmup from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score from model import * from utils import * import time import logging logging.basicConfig(level=logging.DEBUG, filename="train.log",filemode='a') from NEZHA.modeling_nezha import * MODEL_CLASSES = { 'BertForClass': BertForClass, 'BertLastCls': BertLastCls, 'BertLastTwoCls': BertLastTwoCls, 'BertLastTwoClsPooler': BertLastTwoClsPooler, 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 'BertLastFourCls': BertLastFourCls, 'BertLastFourClsPooler': BertLastFourClsPooler, 'BertLastFourEmbeddings': BertLastFourEmbeddings, 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 'BertDynCls': BertDynCls, 'BertDynEmbeddings': BertDynEmbeddings, 'BertRNN': BertRNN, 'BertCNN': BertCNN, 'BertRCNN': BertRCNN, 'XLNet': XLNet, 'Electra': Electra, 'NEZHA': NEZHA, } class Config: def __init__(self): # 预训练模型路径 self.modelId = 2 self.model = "BertForClass" self.Stratification = False self.model_path = '../pretrain/bert_model/' self.num_class = 2 self.dropout = 0.2 self.MAX_LEN = 32 self.epoch = 3 self.learn_rate = 4e-5 self.normal_lr = 1e-4 self.batch_size = 32 self.k_fold = 10 self.seed = 42 self.device = torch.device('cuda') # self.device = torch.device('cpu') self.focalloss = False self.pgd = False self.fgm = True config = Config() os.environ['PYTHONHASHSEED']='0'#消除hash算法的随机性 random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) file_path = './log/' # 创建一个logger logger = logging.getLogger('mylogger') logger.setLevel(logging.DEBUG) train = pd.read_csv('/tcdata/gaiic_track3_round1_train_20210228.tsv',sep='\t',header=None) semi = pd.read_csv('/tcdata/gaiic_track3_round2_train_20210407.tsv',sep='\t',header=None) train = pd.concat([train, semi], sort=False) train.columns=['q1','q2','label'] train_query1 = train['q1'].values.astype(str) train_query2 = train['q2'].values.astype(str) train_label = train['label'].values.astype(int) oof_train = np.zeros((len(train), config.num_class), dtype=np.float32) #kf = StratifiedKFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) kf = KFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) for fold, (train_index, valid_index) in enumerate(kf.split(train_query1, train_label)): print('\n\n------------fold:{}------------\n'.format(fold)) ''' q1 = train_query1[train_index] q2 = train_query2[train_index] y = train_label[train_index] ''' q1 = train_query1 q2 = train_query2 y = train_label val_q1 = train_query1[valid_index] val_q2 = train_query2[valid_index] val_y = train_label[valid_index] train_D = data_generator([q1, q2, y], config, shuffle=True) val_D = data_generator([val_q1, val_q2, val_y], config) model = MODEL_CLASSES[config.model](config).to(config.device) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) if config.pgd: pgd = PGD(model) K = 3 elif config.fgm: fgm = FGM(model) if config.focalloss: loss_fn = FocalLoss(config.num_class) else: loss_fn = nn.CrossEntropyLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 num_train_steps = int(len(train) / config.batch_size * config.epoch) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if config.Stratification: bert_params = [x for x in param_optimizer if 'bert' in x[0]] normal_params = [p for n, p in param_optimizer if 'bert' not in n] optimizer_parameters = [ {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, {'params': normal_params, 'lr': config.normal_lr}, ] else: optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) # lr为全局学习率 scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(len(train) / config.batch_size / 2), num_training_steps=num_train_steps ) best_auc = 0 PATH = './models/bert_{}.pth'.format(fold) save_model_path = './models/' if not os.path.exists(save_model_path): os.makedirs(save_model_path) for e in range(config.epoch): print('\n------------epoch:{}------------'.format(e)) model.train() acc = 0 train_len = 0 loss_num = 0 tq = tqdm(train_D,ncols=70,disable=True) last=time.time() for input_ids, input_masks, segment_ids, labels in tq: label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) loss = loss_fn(y_pred, label_t) loss = loss.mean() loss.backward() if config.pgd: pgd.backup_grad() # 对抗训练 for t in range(K): pgd.attack(is_first_attack=(t == 0)) # 在embedding上添加对抗扰动, first attack时备份param.data if t != K - 1: model.zero_grad() else: pgd.restore_grad() y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 pgd.restore() # 恢复embedding参数 elif config.fgm: # 对抗训练 fgm.attack() # 在embedding上添加对抗扰动 y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 fgm.restore() # 恢复embedding参数 # 梯度下降,更新参数 optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) acc += sum(y_pred == labels) loss_num += loss.item() train_len += len(labels) tq.set_postfix(fold=fold, epoch=e, loss=loss_num / train_len, acc=acc / train_len) print(f"微调第{e}轮耗时:{time.time()-last}") model.eval() with torch.no_grad(): y_p = [] y_l = [] train_logit = None for input_ids, input_masks, segment_ids, labels in tqdm(val_D,disable=True): label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) y_pred = F.softmax(y_pred) y_pred = y_pred.detach().to("cpu").numpy() if train_logit is None: train_logit = y_pred else: train_logit = np.vstack((train_logit, y_pred)) y_p += list(y_pred[:,1]) y_pred = np.argmax(y_pred, axis=1) y_l += list(y_pred) f1 = f1_score(val_y, y_l, average="macro") auc_score = roc_auc_score(val_y, y_p) print("best_auc:{} auc_score:{} f1:{}\n".format(best_auc, auc_score, f1)) if auc_score >= best_auc: best_auc = auc_score oof_train[valid_index] = np.array(train_logit) #torch.save(model.module.state_dict() if hasattr(model, "module") else model.state_dict(), PATH) torch.save(model.module if hasattr(model, "module") else model, PATH) optimizer.zero_grad() del model torch.cuda.empty_cache() break ================================================ FILE: code/bert-base-count3/finetuning/utils.py ================================================ import torch from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig import numpy as np import os import random from Config import * import torch import torch.nn as nn import torch.nn.functional as F def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def fastTokenizer(a:str,b:str,maxLen,tk): a,b=a.split(),b.split() a,b=tk.convert_tokens_to_ids(a),tk.convert_tokens_to_ids(b) maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] input_ids=[tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] token_type_ids=[0]*(len(a)+2)+[1]*(len(b)+1) return {'input_ids': input_ids, 'token_type_ids': token_type_ids} class data_generator: def __init__(self, data, config, shuffle=False): self.data = data self.batch_size = config.batch_size self.max_length = config.MAX_LEN self.shuffle = shuffle vocab = 'vocab.txt' if os.path.exists(config.model_path + 'vocab.txt') else 'spiece.model' self.tokenizer = TOKENIZERS[config.model].from_pretrained(config.model_path + vocab) self.steps = len(self.data[0]) // self.batch_size if len(self.data[0]) % self.batch_size != 0: self.steps += 1 def __len__(self): return self.steps def __iter__(self): q1, q2, y = self.data idxs = list(range(len(self.data[0]))) if self.shuffle: np.random.shuffle(idxs) input_ids, input_masks, segment_ids, labels = [], [], [], [] for index, i in enumerate(idxs): text = q1[i] text_pair = q2[i] ''' # text = self.tokenizer(text, text_pair, padding='max_length', truncation=True, max_length=self.max_length) text = fastTokenizer(text, text_pair, self.max_length, self.tokenizer) input_ids.append(text['input_ids']) segment_ids.append(text['token_type_ids']) input_masks.append([1] * len(text['input_ids'])) # bs为1时无padding,全1 yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] ''' tkRes = self.tokenizer(text, text_pair, max_length=self.max_length, truncation='longest_first', return_attention_mask=False) input_id = tkRes['input_ids'] segment_id = tkRes['token_type_ids'] assert len(segment_id) == len(input_id) input_ids.append(input_id) segment_ids.append(segment_id) labels.append(y[i]) if len(input_ids) == self.batch_size or i == idxs[-1]: input_ids = paddingList(input_ids, 0, returnTensor=True) # 动态padding segment_ids = paddingList(segment_ids, 0, returnTensor=True) input_masks = (input_ids != 0) yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] class PGD(): def __init__(self, model): self.model = model self.emb_backup = {} self.grad_backup = {} def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: if is_first_attack: self.emb_backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0 and not torch.isnan(norm): r_at = alpha * param.grad / norm param.data.add_(r_at) param.data = self.project(name, param.data, epsilon) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.emb_backup param.data = self.emb_backup[name] self.emb_backup = {} def project(self, param_name, param_data, epsilon): r = param_data - self.emb_backup[param_name] if torch.norm(r) > epsilon: r = epsilon * r / torch.norm(r) return self.emb_backup[param_name] + r def backup_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: self.grad_backup[name] = param.grad.clone() def restore_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: param.grad = self.grad_backup[name] class FGM(): def __init__(self, model): self.model = model self.backup = {} def attack(self, epsilon=0.25, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: self.backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0: r_at = epsilon * param.grad / norm param.data.add_(r_at) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.backup param.data = self.backup[name] self.backup = {} # 支持多分类和二分类 class FocalLoss(nn.Module): """ This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) :param num_class: :param alpha: (tensor) 3D or 4D the scalar factor for this criterion :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more focus on hard misclassified example :param smooth: (float,double) smooth value when cross entropy :param balance_index: (int) balance class index, should be specific when alpha is float :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. """ def __init__(self, num_class, alpha=None, gamma=2, smooth=None, size_average=True): super(FocalLoss, self).__init__() self.num_class = num_class self.alpha = alpha self.gamma = gamma self.smooth = smooth self.size_average = size_average if self.alpha is None: self.alpha = torch.ones(self.num_class, 1) elif isinstance(self.alpha, (list, np.ndarray)): assert len(self.alpha) == self.num_class self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) self.alpha = self.alpha / self.alpha.sum() else: raise TypeError('Not support alpha type') if self.smooth is not None: if self.smooth < 0 or self.smooth > 1.0: raise ValueError('smooth value should be in [0,1]') def forward(self, input, target): logit = F.softmax(input, dim=1) if logit.dim() > 2: # N,C,d1,d2 -> N,C,m (m=d1*d2*...) logit = logit.view(logit.size(0), logit.size(1), -1) logit = logit.permute(0, 2, 1).contiguous() logit = logit.view(-1, logit.size(-1)) target = target.view(-1, 1) # N = input.size(0) # alpha = torch.ones(N, self.num_class) # alpha = alpha * (1 - self.alpha) # alpha = alpha.scatter_(1, target.long(), self.alpha) epsilon = 1e-10 alpha = self.alpha if alpha.device != input.device: alpha = alpha.to(input.device) idx = target.cpu().long() one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() one_hot_key = one_hot_key.scatter_(1, idx, 1) if one_hot_key.device != logit.device: one_hot_key = one_hot_key.to(logit.device) if self.smooth: one_hot_key = torch.clamp( one_hot_key, self.smooth, 1.0 - self.smooth) pt = (one_hot_key * logit).sum(1) + epsilon logpt = pt.log() gamma = self.gamma alpha = alpha[idx] loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt if self.size_average: loss = loss.mean() else: loss = loss.sum() return loss def f1_match(y_true,y_pred): acc = sum(y_pred & y_true) / (sum(y_pred)) rec = sum(y_pred & y_true) / (sum(y_true)) return 2 * acc * rec /(acc + rec) ================================================ FILE: code/bert-base-count3/pretrain/NLP_Utils.py ================================================ import random import json import transformers as _ from transformers1 import BertTokenizer import torch from torch.utils.data import Dataset,DataLoader import numpy as np from itertools import chain def writeToJsonFile(path: str, obj): with open(path, "w", encoding="utf-8") as f: f.write(json.dumps(obj, ensure_ascii=False,indent=0)) def readFromJsonFile(path: str): with open(path, "r", encoding="utf-8") as f: return json.loads(f.read()) def loadData(path): allData=[] with open(path,"r") as f: for i in f: i=i.strip().split('\t') if len(i)==0:#防止空行 break if len(i)==3:#训练集 a,b,label=i a=a.split(' ') b=b.split(' ') else:#测试集,直接转为id形式 a,b,label=i[0],i[1],-1 a=a.split(' ') b=b.split(' ') allData.append([a,b,label]) return allData def calNegPos(ls):#计算正负比例 posNum,negNum=0,0 for i in ls: if i[2]==0: negNum+=1 elif i[2]==1: posNum+=1 posNum=1 if posNum==0 else posNum return negNum,posNum,round(negNum/posNum,4) allData=loadData('/tcdata/gaiic_track3_round1_train_20210228.tsv')+loadData('/tcdata/gaiic_track3_round2_train_20210407.tsv') testA_data = loadData('/tcdata/gaiic_track3_round1_testA_20210228.tsv') testB_data = loadData('/tcdata/gaiic_track3_round1_testB_20210317.tsv') random.shuffle(allData) train_data=allData+testA_data+testB_data#全量 valid_data=allData[-20000:] print("训练集样本数量:", len(train_data)) def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def truncate(a:list,b:list,maxLen): maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] return a,b class MLM_Data(Dataset): #传入句子对列表 def __init__(self,textLs:list,maxLen:int,tk:BertTokenizer): super().__init__() self.data=textLs self.maxLen=maxLen self.tk=tk self.spNum=len(tk.all_special_tokens) self.tkNum=tk.vocab_size def __len__(self): return len(self.data) def random_mask(self,text_ids): input_ids, output_ids = [], [] rands = np.random.random(len(text_ids)) idx=0 while idx0.5: text1,text2=text2,text1#交换位置 text1,text2=truncate(text1,text2,self.maxLen) text1_ids,text2_ids = self.tk.convert_tokens_to_ids(text1),self.tk.convert_tokens_to_ids(text2) text1_ids, out1_ids = self.random_mask(text1_ids)#添加mask预测 text2_ids, out2_ids = self.random_mask(text2_ids) input_ids = [self.tk.cls_token_id] + text1_ids + [self.tk.sep_token_id] + text2_ids + [self.tk.sep_token_id]#拼接 token_type_ids=[0]*(len(text1_ids)+2)+[1]*(len(text2_ids)+1) labels = [-100] + out1_ids + [-100] + out2_ids + [-100] assert len(input_ids)==len(token_type_ids)==len(labels) return {'input_ids':input_ids,'token_type_ids':token_type_ids,'labels':labels} @classmethod def collate(cls,batch): input_ids=[i['input_ids'] for i in batch] token_type_ids=[i['token_type_ids'] for i in batch] labels=[i['labels'] for i in batch] input_ids=paddingList(input_ids,0,returnTensor=True) token_type_ids=paddingList(token_type_ids,0,returnTensor=True) labels=paddingList(labels,-100,returnTensor=True) attention_mask=(input_ids!=0) return {'input_ids':input_ids,'token_type_ids':token_type_ids ,'attention_mask':attention_mask,'labels':labels} unionList=lambda ls:list(chain(*ls))#按元素拼接 splitList=lambda x,bs:[x[i:i+bs] for i in range(0,len(x),bs)]#按bs切分 #sortBsNum:原序列按多少个bs块为单位排序,可用来增强随机性 #比如如果每次打乱后都全体一起排序,那每次都是一样的 def blockShuffle(data:list,bs:int,sortBsNum,key): random.shuffle(data)#先打乱 tail=len(data)%bs#计算碎片长度 tail=[] if tail==0 else data[-tail:] data=data[:len(data)-len(tail)] assert len(data)%bs==0#剩下的一定能被bs整除 sortBsNum=len(data)//bs if sortBsNum is None else sortBsNum#为None就是整体排序 data=splitList(data,sortBsNum*bs) data=[sorted(i,key=key,reverse=True) for i in data]#每个大块进行降排序 data=unionList(data) data=splitList(data,bs)#最后,按bs分块 random.shuffle(data)#块间打乱 data=unionList(data)+tail return data from torch.utils.data.dataloader import _SingleProcessDataLoaderIter,_MultiProcessingDataLoaderIter #每轮迭代重新分块shuffle数据的DataLoader class blockShuffleDataLoader(DataLoader): def __init__(self, dataset: Dataset,sortBsNum,key,**kwargs): assert isinstance(dataset.data,list)#需要有list类型的data属性 super().__init__(dataset,**kwargs)#父类的参数传过去 self.sortBsNum=sortBsNum self.key=key def __iter__(self): #分块shuffle self.dataset.data=blockShuffle(self.dataset.data,self.batch_size,self.sortBsNum,self.key) if self.num_workers == 0: return _SingleProcessDataLoaderIter(self) else: return _MultiProcessingDataLoaderIter(self) ================================================ FILE: code/bert-base-count3/pretrain/__init__.py ================================================ ================================================ FILE: code/bert-base-count3/pretrain/bert_model/gitkeep ================================================ ================================================ FILE: code/bert-base-count3/pretrain/train_bert.py ================================================ # coding:utf-8 import numpy as np import random import os random.seed(0) np.random.seed(0)#seed应该在main里尽早设置,以防万一 os.environ['PYTHONHASHSEED'] =str(0)#消除hash算法的随机性 from transformers import BertForMaskedLM#除nezha外模型用新版加载 from transformers1 import Trainer, TrainingArguments,BertTokenizer,BertConfig from NLP_Utils import MLM_Data,train_data,blockShuffleDataLoader maxlen=32 batch_size=128 vocab_file_dir = './bert_model/vocab.txt' tokenizer = BertTokenizer.from_pretrained(vocab_file_dir) config = BertConfig( vocab_size=len(tokenizer), hidden_size=768, num_hidden_layers=12, num_attention_heads=12, max_position_embeddings=512, ) # 把层数改为8层 model = BertForMaskedLM.from_pretrained('../../bert-base-chinese') model.resize_token_embeddings(len(tokenizer)) print(model) train_MLM_data=MLM_Data(train_data,maxlen,tokenizer) #自己定义dataloader,不要用huggingface的 dl=blockShuffleDataLoader(train_MLM_data,None,key=lambda x:len(x[0])+len(x[1]),shuffle=False ,batch_size=batch_size,collate_fn=train_MLM_data.collate) training_args = TrainingArguments( output_dir='./bert_output', overwrite_output_dir=True, num_train_epochs=400, per_device_train_batch_size=batch_size, save_steps=len(dl)*10000,#每10个epoch save一次 save_total_limit=3, logging_steps=len(dl),#每个epoch log一次 seed=2021, learning_rate=5e-5, lr_end=1e-5,#学习率衰减的终点 weight_decay=0.01, warmup_steps=int(450000*150/batch_size*0.03) ) trainer = Trainer( model=model, args=training_args, train_dataLoader=dl, prediction_loss_only=True, ) if __name__ == '__main__': trainer.train() trainer.save_model('./bert_model') ================================================ FILE: code/bert-base-count3/pretrain/transformers1/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. __version__ = "2.11.0" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. # see: https://github.com/abseil/abseil-py/issues/99 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 try: import absl.logging except ImportError: pass else: absl.logging.set_verbosity("info") absl.logging.set_stderrthreshold("info") absl.logging._warn_preinit_stderr = False import logging # Configurations from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig from .configuration_bart import BartConfig from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig from .configuration_marian import MarianConfig from .configuration_mmbt import MMBTConfig from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig from .configuration_utils import PretrainedConfig from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig from .data import ( DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor, SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels, is_sklearn_available, squad_convert_examples_to_features, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, ) # Files and general utilities from .file_utils import ( CONFIG_NAME, MODEL_CARD_NAME, PYTORCH_PRETRAINED_BERT_CACHE, PYTORCH_TRANSFORMERS_CACHE, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, TRANSFORMERS_CACHE, WEIGHTS_NAME, add_end_docstrings, add_start_docstrings, cached_path, is_tf_available, is_torch_available, ) from .hf_argparser import HfArgumentParser # Model Cards from .modelcard import ModelCard # TF 2.0 <=> PyTorch conversion utilities from .modeling_tf_pytorch_utils import ( convert_tf_weight_name_to_pt_weight_name, load_pytorch_checkpoint_in_tf2_model, load_pytorch_model_in_tf2_model, load_pytorch_weights_in_tf2_model, load_tf2_checkpoint_in_pytorch_model, load_tf2_model_in_pytorch_model, load_tf2_weights_in_pytorch_model, ) # Pipelines from .pipelines import ( CsvPipelineDataFormat, FeatureExtractionPipeline, FillMaskPipeline, JsonPipelineDataFormat, NerPipeline, PipedPipelineDataFormat, Pipeline, PipelineDataFormat, QuestionAnsweringPipeline, SummarizationPipeline, TextClassificationPipeline, TextGenerationPipeline, TokenClassificationPipeline, TranslationPipeline, pipeline, ) # Tokenizers from .tokenization_albert import AlbertTokenizer from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from .tokenization_bart import BartTokenizer, MBartTokenizer from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_reformer import ReformerTokenizer from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_t5 import T5Tokenizer from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer from .trainer_utils import EvalPrediction from .training_args import TrainingArguments from .training_args_tf import TFTrainingArguments logger = logging.getLogger(__name__) # pylint: disable=invalid-name if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics # Modeling if is_torch_available(): from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, apply_chunking_to_forward from .modeling_auto import ( AutoModel, AutoModelForPreTraining, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead, AutoModelForTokenClassification, AutoModelForMultipleChoice, MODEL_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, ) from .modeling_bert import ( BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering, load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertLayer, ) from .modeling_openai import ( OpenAIGPTPreTrainedModel, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_transfo_xl import ( TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, AdaptiveEmbedding, load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_gpt2 import ( GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_LIST from .modeling_xlnet import ( XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForTokenClassification, XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering, load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_xlm import ( XLMPreTrainedModel, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForTokenClassification, XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, XLM_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_bart import ( BartForSequenceClassification, BartModel, BartForConditionalGeneration, BART_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_marian import MarianMTModel from .tokenization_marian import MarianTokenizer from .modeling_roberta import ( RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification, RobertaForQuestionAnswering, ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_distilbert import ( DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_camembert import ( CamembertForMaskedLM, CamembertModel, CamembertForSequenceClassification, CamembertForMultipleChoice, CamembertForTokenClassification, CamembertForQuestionAnswering, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_encoder_decoder import EncoderDecoderModel from .modeling_t5 import ( T5PreTrainedModel, T5Model, T5ForConditionalGeneration, load_tf_weights_in_t5, T5_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_albert import ( AlbertPreTrainedModel, AlbertModel, AlbertForPreTraining, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, AlbertForTokenClassification, load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_xlm_roberta import ( XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification from .modeling_flaubert import ( FlaubertModel, FlaubertWithLMHeadModel, FlaubertForSequenceClassification, FlaubertForQuestionAnswering, FlaubertForQuestionAnsweringSimple, FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_electra import ( ElectraForPreTraining, ElectraForMaskedLM, ElectraForTokenClassification, ElectraPreTrainedModel, ElectraForSequenceClassification, ElectraModel, load_tf_weights_in_electra, ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_reformer import ( ReformerAttention, ReformerLayer, ReformerModel, ReformerModelWithLMHead, REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_longformer import ( LongformerModel, LongformerForMaskedLM, LongformerForSequenceClassification, LongformerForMultipleChoice, LongformerForTokenClassification, LongformerForQuestionAnswering, LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) # Optimization from .optimization import ( AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup, ) # Trainer from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments # Benchmarks from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments # TensorFlow if is_tf_available(): from .modeling_tf_utils import ( TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, tf_top_k_top_p_filtering, ) from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForPreTraining, TFAutoModelForMultipleChoice, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_MODEL_MAPPING, TF_MODEL_FOR_PRETRAINING_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, ) from .modeling_tf_bert import ( TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, TFBertModel, TFBertForPreTraining, TFBertForMaskedLM, TFBertForNextSentencePrediction, TFBertForSequenceClassification, TFBertForMultipleChoice, TFBertForTokenClassification, TFBertForQuestionAnswering, TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_gpt2 import ( TFGPT2PreTrainedModel, TFGPT2MainLayer, TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_openai import ( TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer, TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_transfo_xl import ( TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, TFAdaptiveEmbedding, ) from .modeling_tf_xlnet import ( TFXLNetPreTrainedModel, TFXLNetMainLayer, TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForTokenClassification, TFXLNetForQuestionAnsweringSimple, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_xlm import ( TFXLMPreTrainedModel, TFXLMMainLayer, TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_xlm_roberta import ( TFXLMRobertaForMaskedLM, TFXLMRobertaModel, TFXLMRobertaForSequenceClassification, TFXLMRobertaForTokenClassification, TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_roberta import ( TFRobertaPreTrainedModel, TFRobertaMainLayer, TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaForQuestionAnswering, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_camembert import ( TFCamembertModel, TFCamembertForMaskedLM, TFCamembertForSequenceClassification, TFCamembertForTokenClassification, TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_flaubert import ( TFFlaubertModel, TFFlaubertWithLMHeadModel, TFFlaubertForSequenceClassification, TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_distilbert import ( TFDistilBertPreTrainedModel, TFDistilBertMainLayer, TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertForQuestionAnswering, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_ctrl import ( TFCTRLPreTrainedModel, TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_albert import ( TFAlbertPreTrainedModel, TFAlbertMainLayer, TFAlbertModel, TFAlbertForPreTraining, TFAlbertForMaskedLM, TFAlbertForMultipleChoice, TFAlbertForSequenceClassification, TFAlbertForQuestionAnswering, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_t5 import ( TFT5PreTrainedModel, TFT5Model, TFT5ForConditionalGeneration, TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_electra import ( TFElectraPreTrainedModel, TFElectraModel, TFElectraForPreTraining, TFElectraForMaskedLM, TFElectraForTokenClassification, TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ) # Optimization from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator # Trainer from .trainer_tf import TFTrainer if not is_tf_available() and not is_torch_available(): logger.warning( "Neither PyTorch nor TensorFlow >= 2.0 have been found." "Models won't be available and only tokenizers, configuration" "and file/data utilities can be used." ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/__main__.py ================================================ # coding: utf8 def main(): import sys if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: print( "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" "It should be used as one of: \n" ">> transformers1 bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" ">> transformers1 gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" ">> transformers1 transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" ">> transformers1 gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" ">> transformers1 xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" ">> transformers1 xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") else: if sys.argv[1] == "bert": try: from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) != 5: # pylint: disable=line-too-long print("Should be used as `transformers1 bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") else: PYTORCH_DUMP_OUTPUT = sys.argv.pop() TF_CONFIG = sys.argv.pop() TF_CHECKPOINT = sys.argv.pop() convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "gpt": from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") else: OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: OPENAI_GPT_CONFIG = sys.argv[4] else: OPENAI_GPT_CONFIG = "" convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, OPENAI_GPT_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "transfo_xl": try: from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: if 'ckpt' in sys.argv[2].lower(): TF_CHECKPOINT = sys.argv[2] TF_DATASET_FILE = "" else: TF_DATASET_FILE = sys.argv[2] TF_CHECKPOINT = "" PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: TF_CONFIG = sys.argv[4] else: TF_CONFIG = "" convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) elif sys.argv[1] == "gpt2": try: from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: TF_CHECKPOINT = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: TF_CONFIG = sys.argv[4] else: TF_CONFIG = "" convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "xlnet": try: from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 5 or len(sys.argv) > 6: # pylint: disable=line-too-long print("Should be used as `transformers1 xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") else: TF_CHECKPOINT = sys.argv[2] TF_CONFIG = sys.argv[3] PYTORCH_DUMP_OUTPUT = sys.argv[4] if len(sys.argv) == 6: FINETUNING_TASK = sys.argv[5] else: FINETUNING_TASK = None convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, FINETUNING_TASK) elif sys.argv[1] == "xlm": from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch if len(sys.argv) != 4: # pylint: disable=line-too-long print("Should be used as `transformers1 xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") else: XLM_CHECKPOINT_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) if __name__ == '__main__': main() ================================================ FILE: code/bert-base-count3/pretrain/transformers1/activations.py ================================================ import logging import math import torch import torch.nn.functional as F logger = logging.getLogger(__name__) def swish(x): return x * torch.sigmoid(x) def _gelu_python(x): """ Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in torch.nn.functional Also see https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) def gelu_new(x): """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see https://arxiv.org/abs/1606.08415 """ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) if torch.__version__ < "1.4.0": gelu = _gelu_python else: gelu = F.gelu def gelu_fast(x): return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) ACT2FN = { "relu": F.relu, "swish": swish, "gelu": gelu, "tanh": torch.tanh, "gelu_new": gelu_new, "gelu_fast": gelu_fast, } def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/another_try.py ================================================ from transformers import TFBertModel, BertTokenizer, BertConfig import tensorflow as tf config = BertConfig.from_pretrained("bert-base-cased", output_hidden_states=True) model = TFBertModel.from_pretrained("bert-base-cased", config=config) tok = BertTokenizer.from_pretrained("bert-base-cased") text = tok.encode("Ain't this [MASK] best thing you've ever seen?") inputs = tf.constant(text) outputs = model.predict(inputs) print(outputs) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from ..file_utils import is_torch_available if is_torch_available(): from .benchmark_args import PyTorchBenchmarkArguments from .benchmark import PyTorchBenchmark ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark/benchmark.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Benchmarking the library on inference and training in PyTorch. """ import inspect import logging import timeit from transformers import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, PretrainedConfig, is_torch_available from .benchmark_utils import Benchmark, Memory, start_memory_tracing, stop_memory_tracing if is_torch_available(): import torch from .benchmark_args import PyTorchBenchmarkArguments logger = logging.getLogger(__name__) class PyTorchBenchmark(Benchmark): args: PyTorchBenchmarkArguments configs: PretrainedConfig framework: str = "PyTorch" @property def framework_version(self): return torch.__version__ def train(self, model_name, batch_size, sequence_length, trace_memory=False): try: config = self.config_dict[model_name] model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) model.to(self.args.device) model.train() input_ids = torch.randint( model.config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device ) def compute_loss_and_backprob(): # TODO: Not all models call labels argument labels => this hack using the function signature should be corrected once all models have a common name for labels function_argument_names = inspect.getfullargspec(model.forward).args if "labels" in function_argument_names: loss = model(input_ids, labels=input_ids)[0] elif "lm_labels" in function_argument_names: loss = model(input_ids, lm_labels=input_ids)[0] elif "masked_lm_labels" in function_argument_names: loss = model(input_ids, masked_lm_labels=input_ids)[0] else: NotImplementedError(f"{model_name} does not seem to allow training with labels") loss.backward() model.zero_grad() if trace_memory is True: if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: trace = start_memory_tracing("transformers1") else: # clear cuda cache torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() # calculate loss and do backpropagation compute_loss_and_backprob() if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: summary = stop_memory_tracing(trace) memory = summary.total else: memory = Memory(torch.cuda.max_memory_reserved()) return memory else: # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average runtimes = timeit.repeat(lambda: compute_loss_and_backprob(), repeat=self.args.repeat, number=10,) return min(runtimes) / 10.0 except RuntimeError as e: self.print_fn("Doesn't fit on GPU. {}".format(e)) return "N/A" def inference(self, model_name, batch_size, sequence_length, trace_memory=False): try: config = self.config_dict[model_name] model = MODEL_MAPPING[config.__class__](config) model.to(self.args.device) model.eval() input_ids = torch.randint( config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device ) if trace_memory is True: if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: trace = start_memory_tracing("transformers1") else: # clear cuda cache torch.cuda.empty_cache() if hasattr(torch.cuda, "max_memory_reserved"): torch.cuda.reset_peak_memory_stats() else: logger.info( "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" ) torch.cuda.reset_max_memory_cached() model(input_ids) if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: summary = stop_memory_tracing(trace) memory = summary.total else: if hasattr(torch.cuda, "max_memory_reserved"): memory = Memory(torch.cuda.max_memory_reserved()) else: logger.info( "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" ) memory = Memory(torch.cuda.max_memory_cached()) return memory else: # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average runtimes = timeit.repeat(lambda: model(input_ids), repeat=self.args.repeat, number=10,) return min(runtimes) / 10.0 except RuntimeError as e: self.print_fn("Doesn't fit on GPU. {}".format(e)) return "N/A" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark/benchmark_args.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from dataclasses import dataclass, field from typing import Tuple from ..file_utils import cached_property, is_torch_available, torch_required from .benchmark_args_utils import BenchmarkArguments if is_torch_available(): import torch try: import torch_xla.core.xla_model as xm _has_tpu = True except ImportError: _has_tpu = False @torch_required def is_tpu_available(): return _has_tpu logger = logging.getLogger(__name__) @dataclass class PyTorchBenchmarkArguments(BenchmarkArguments): no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"}) torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"}) fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) @cached_property @torch_required def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_tpu_available(): device = xm.xla_device() n_gpu = 0 else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() return device, n_gpu @property @torch_required def device_idx(self) -> int: return torch.cuda.current_device() @property @torch_required def device(self) -> "torch.device": return self._setup_devices[0] @property @torch_required def n_gpu(self): return self._setup_devices[1] ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark/benchmark_args_utils.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import dataclasses import json from dataclasses import dataclass, field from time import time from typing import List def list_field(default=None, metadata=None): return field(default_factory=lambda: default, metadata=metadata) @dataclass class BenchmarkArguments: """ BenchMarkArguments are arguments we use in our benchmark scripts **which relate to the training loop itself**. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ models: List[str] = list_field( default=[], metadata={ "help": "Model checkpoints to be provided to the AutoModel classes. Leave blank to benchmark the base version of all available models" }, ) batch_sizes: List[int] = list_field( default=[8], metadata={"help": "List of batch sizes for which memory and time performance will be evaluated"} ) sequence_lengths: List[int] = list_field( default=[8, 32, 128, 512], metadata={"help": "List of sequence lengths for which memory and time performance will be evaluated"}, ) no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"}) training: bool = field(default=False, metadata={"help": "Benchmark training of model"}) verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"}) no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurments"}) no_memory: bool = field(default=False, metadata={"help": "Don't perform memory measurments"}) trace_memory_line_by_line: bool = field(default=False, metadata={"help": "Trace memory line by line"}) save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"}) log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"}) no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"}) inference_time_csv_file: str = field( default=f"inference_time_{round(time())}.csv", metadata={"help": "CSV filename used if saving time results to csv."}, ) inference_memory_csv_file: str = field( default=f"inference_memory_{round(time())}.csv", metadata={"help": "CSV filename used if saving memory results to csv."}, ) train_time_csv_file: str = field( default=f"train_time_{round(time())}.csv", metadata={"help": "CSV filename used if saving time results to csv for training."}, ) train_memory_csv_file: str = field( default=f"train_memory_{round(time())}.csv", metadata={"help": "CSV filename used if saving memory results to csv for training."}, ) env_info_csv_file: str = field( default=f"env_info_{round(time())}.csv", metadata={"help": "CSV filename used if saving environment information."}, ) log_filename: str = field( default=f"log_{round(time())}.csv", metadata={"help": "Log filename used if print statements are saved in log."}, ) repeat: int = field(default=3, metadata={"help": "Times an experiment will be run."}) def to_json_string(self): """ Serializes this instance to a JSON string. """ return json.dumps(dataclasses.asdict(self), indent=2) @property def model_names(self): return self.models ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark/benchmark_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import copy import csv import linecache import logging import os import platform import sys from abc import ABC, abstractmethod from collections import defaultdict, namedtuple from datetime import datetime from typing import Iterable, List, NamedTuple, Optional, Union from transformers import AutoConfig, PretrainedConfig from transformers import __version__ as version from ..file_utils import is_tf_available, is_torch_available from .benchmark_args_utils import BenchmarkArguments if is_torch_available(): from torch.cuda import empty_cache as torch_empty_cache if is_tf_available(): from tensorflow.python.eager import context as tf_context logger = logging.getLogger(__name__) # pylint: disable=invalid-name _is_memory_tracing_enabled = False BenchmarkOutput = namedtuple( "BenchmarkOutput", ["time_inference_result", "memory_inference_result", "time_train_result", "memory_train_result"] ) def is_memory_tracing_enabled(): global _is_memory_tracing_enabled return _is_memory_tracing_enabled class Frame(NamedTuple): """ `Frame` is a NamedTuple used to gather the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ filename: str module: str line_number: int event: str line_text: str class UsedMemoryState(NamedTuple): """ `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) """ frame: Frame cpu_memory: int gpu_memory: int class Memory(NamedTuple): """ `Memory` NamedTuple have a single field `bytes` and you can get a human readable str of the number of mega bytes by calling `__repr__` - `byte` (integer): number of bytes, """ bytes: int def __repr__(self) -> str: return str(bytes_to_mega_bytes(self.bytes)) class MemoryState(NamedTuple): """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ frame: Frame cpu: Memory gpu: Memory cpu_gpu: Memory class MemorySummary(NamedTuple): """ `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). """ sequential: List[MemoryState] cumulative: List[MemoryState] current: List[MemoryState] total: Memory MemoryTrace = List[UsedMemoryState] def start_memory_tracing( modules_to_trace: Optional[Union[str, Iterable[str]]] = None, modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None, events_to_trace: str = "line", gpus_to_trace: Optional[List[int]] = None, ) -> MemoryTrace: """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `../../examples/benchmarks.py for a usage example. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident Set Size” (the non-swapped physical memory the process is using). See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info Args: - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers1.modeling_gpt2') - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch') - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events) default to line - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs Return: - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script). - `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install psutil (pip install psutil) to use CPU memory tracing." ) process = None else: process = psutil.Process(os.getpid()) try: from py3nvml import py3nvml py3nvml.nvmlInit() devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace py3nvml.nvmlShutdown() except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to use GPU memory tracing." ) log_gpu = False except (OSError, py3nvml.NVMLError): logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") log_gpu = False else: log_gpu = is_torch_available() or is_tf_available() memory_trace = [] def traceit(frame, event, args): """ Tracing method executed before running each line in a module or sub-module Record memory allocated in a list with debugging information """ global _is_memory_tracing_enabled if not _is_memory_tracing_enabled: return traceit # Filter events if events_to_trace is not None: if isinstance(events_to_trace, str) and event != events_to_trace: return traceit elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace: return traceit # Filter modules name = frame.f_globals["__name__"] if not isinstance(name, str): return traceit else: # Filter whitelist of modules to trace if modules_to_trace is not None: if isinstance(modules_to_trace, str) and modules_to_trace not in name: return traceit elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace): return traceit # Filter blacklist of modules not to trace if modules_not_to_trace is not None: if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name: return traceit elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace): return traceit # Record current tracing state (file, location in file...) lineno = frame.f_lineno filename = frame.f_globals["__file__"] if filename.endswith(".pyc") or filename.endswith(".pyo"): filename = filename[:-1] line = linecache.getline(filename, lineno).rstrip() traced_state = Frame(filename, name, lineno, event, line) # Record current memory state (rss memory) and compute difference with previous memory state cpu_mem = 0 if process is not None: mem = process.memory_info() cpu_mem = mem.rss gpu_mem = 0 if log_gpu: # Clear GPU caches if is_torch_available(): torch_empty_cache() if is_tf_available(): tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802 # Sum used memory for all GPUs py3nvml.nvmlInit() for i in devices: handle = py3nvml.nvmlDeviceGetHandleByIndex(i) meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) gpu_mem += meminfo.used py3nvml.nvmlShutdown() mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem) memory_trace.append(mem_state) return traceit sys.settrace(traceit) global _is_memory_tracing_enabled _is_memory_tracing_enabled = True return memory_trace def stop_memory_tracing( memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True ) -> Optional[MemorySummary]: """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given. Args: - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory Return: - None if `memory_trace` is None - `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). `Memory` named tuple have fields - `byte` (integer): number of bytes, - `string` (string): same as human readable string (ex: "3.5MB") `Frame` are namedtuple used to list the current frame state and have the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ global _is_memory_tracing_enabled _is_memory_tracing_enabled = False if memory_trace is not None and len(memory_trace) > 1: memory_diff_trace = [] memory_curr_trace = [] cumulative_memory_dict = defaultdict(lambda: [0, 0, 0]) for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip( memory_trace[:-1], memory_trace[1:] ): cpu_mem_inc = next_cpu_mem - cpu_mem gpu_mem_inc = next_gpu_mem - gpu_mem cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc memory_diff_trace.append( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) ) memory_curr_trace.append( MemoryState( frame=frame, cpu=Memory(next_cpu_mem), gpu=Memory(next_gpu_mem), cpu_gpu=Memory(next_gpu_mem + next_cpu_mem), ) ) cumulative_memory_dict[frame][0] += cpu_mem_inc cumulative_memory_dict[frame][1] += gpu_mem_inc cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc cumulative_memory = sorted( list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True ) # order by the total CPU + GPU memory increase cumulative_memory = list( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory ) memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True) if ignore_released_memory: total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace) else: total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace) total_memory = Memory(total_memory) return MemorySummary( sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory, ) return None def bytes_to_mega_bytes(memory_amount: int) -> int: """ Utility to convert a number of bytes (int) into a number of mega bytes (int) """ return memory_amount >> 20 class Benchmark(ABC): """ Benchmarks is a simple but feature-complete benchmarking script to compare memory and time performance of models in Transformers. """ args: BenchmarkArguments configs: PretrainedConfig framework: str def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None): self.args = args if configs is None: self.config_dict = { model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names } else: self.config_dict = {model_name: config for model_name, config in zip(self.args.model_names, configs)} self._print_fn = None self._framework_version = None self._environment_info = None @property def print_fn(self): if self._print_fn is None: if self.args.log_print: logging.basicConfig( level=logging.DEBUG, filename=self.args.log_filename, filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s", ) def print_and_log(*args): logging.info(*args) print(*args) self._print_fn = print_and_log else: self._print_fn = print return self._print_fn @property def is_gpu(self): return self.args.n_gpu > 0 @property @abstractmethod def framework_version(self): pass @abstractmethod def train(self, model_name, batch_size, sequence_length): pass @abstractmethod def inference(self, model_name, batch_size, sequence_length): pass def run(self): result_dict = {model_name: {} for model_name in self.args.model_names} inference_result_time = copy.deepcopy(result_dict) inference_result_memory = copy.deepcopy(result_dict) train_result_time = copy.deepcopy(result_dict) train_result_memory = copy.deepcopy(result_dict) for c, model_name in enumerate(self.args.model_names): self.print_fn(f"{c + 1} / {len(self.args.model_names)}") model_dict = { "bs": self.args.batch_sizes, "ss": self.args.sequence_lengths, "result": {i: {} for i in self.args.batch_sizes}, } inference_result_time[model_name] = copy.deepcopy(model_dict) inference_result_memory[model_name] = copy.deepcopy(model_dict) train_result_time[model_name] = copy.deepcopy(model_dict) train_result_memory[model_name] = copy.deepcopy(model_dict) for batch_size in self.args.batch_sizes: for sequence_length in self.args.sequence_lengths: if not self.args.no_inference: if not self.args.no_memory: memory = self.inference(model_name, batch_size, sequence_length, trace_memory=True) inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) inference_result_time[model_name]["result"][batch_size][sequence_length] = time if self.args.training: if not self.args.no_memory: memory = self.train(model_name, batch_size, sequence_length, trace_memory=True) train_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) train_result_time[model_name]["result"][batch_size][sequence_length] = time if not self.args.no_inference: if not self.args.no_speed: self.print_fn("======= INFERENCE - SPEED - RESULT =======") self.print_results(inference_result_time) self.save_to_csv(inference_result_time, self.args.inference_time_csv_file) if not self.args.no_memory: self.print_fn("======= INFERENCE - MEMORY - RESULT =======") self.print_results(inference_result_memory) self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file) if self.args.training: if not self.args.no_speed: self.print_fn("======= TRAIN - SPEED - RESULT =======") self.print_results(train_result_time) self.save_to_csv(train_result_time, self.args.train_time_csv_file) if not self.args.no_memory: self.print_fn("======= TRAIN - MEMORY - RESULT =======") self.print_results(train_result_memory) self.save_to_csv(train_result_memory, self.args.train_memory_csv_file) if not self.args.no_env_print: self.print_fn("\n======== ENVIRONMENT - INFORMATION ========") self.print_fn( "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n" ) if self.args.save_to_csv: with open(self.args.env_info_csv_file, mode="w", newline="") as csv_file: writer = csv.writer(csv_file) for key, value in self.environment_info.items(): writer.writerow([key, value]) return BenchmarkOutput(inference_result_time, inference_result_memory, train_result_time, train_result_memory) @property def environment_info(self): if self._environment_info is None: info = {} info["transformers_version"] = version info["framework"] = self.framework info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" else: info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) info["use_gpu"] = self.is_gpu if self.is_gpu: info["num_gpus"] = self.args.n_gpu try: from py3nvml import py3nvml py3nvml.nvmlInit() handle = py3nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" except (OSError, py3nvml.NVMLError): logger.warning( "Error while initializing comunication with GPU. " "We won't log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" py3nvml.nvmlShutdown() else: info["gpu"] = py3nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes(py3nvml.nvmlDeviceGetMemoryInfo(handle).total) info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(handle) py3nvml.nvmlShutdown() self._environment_info = info return self._environment_info def print_results(self, result_dict): for model_name in self.args.model_names: self.print_fn("\t" + f"======= MODEL CHECKPOINT: {model_name} =======") for batch_size in result_dict[model_name]["bs"]: for sequence_length in result_dict[model_name]["ss"]: result = result_dict[model_name]["result"][batch_size][sequence_length] if isinstance(result, float): self.print_fn( f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{(round(1000 * result) / 1000)}s" ) else: self.print_fn(f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{result} MB") def print_memory_trace_statistics(self, summary: MemorySummary): self.print_fn( "\nLine by line memory consumption:\n" + "\n".join( f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.sequential ) ) self.print_fn( "\nLines with top memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[:6] ) ) self.print_fn( "\nLines with lowest memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[-6:] ) ) self.print_fn(f"\nTotal memory increase: {summary.total}") def save_to_csv(self, result_dict, filename): if not self.args.save_to_csv: return self.print_fn("Saving results to csv.") with open(filename, mode="w") as csv_file: assert len(self.args.model_names) > 0, "At least 1 model should be defined, but got {}".format( self.model_names ) fieldnames = ["model", "batch_size", "sequence_length"] writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["result"]) writer.writeheader() for model_name in self.args.model_names: result_dict_model = result_dict[model_name]["result"] for bs in result_dict_model: for ss in result_dict_model[bs]: result_model = result_dict_model[bs][ss] writer.writerow( { "model": model_name, "batch_size": bs, "sequence_length": ss, "result": ("{}" if not isinstance(result_model, float) else "{:.4f}").format( result_model ), } ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/benchmark_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import linecache import logging import os import sys from collections import defaultdict from typing import Iterable, List, NamedTuple, Optional, Union from .file_utils import is_tf_available, is_torch_available if is_torch_available(): from torch.cuda import empty_cache as torch_empty_cache if is_tf_available(): from tensorflow.python.eager import context as tf_context logger = logging.getLogger(__name__) # pylint: disable=invalid-name _is_memory_tracing_enabled = False def is_memory_tracing_enabled(): global _is_memory_tracing_enabled return _is_memory_tracing_enabled class Frame(NamedTuple): """ `Frame` is a NamedTuple used to gather the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ filename: str module: str line_number: int event: str line_text: str class UsedMemoryState(NamedTuple): """ `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) """ frame: Frame cpu_memory: int gpu_memory: int class Memory(NamedTuple): """ `Memory` NamedTuple have a single field `bytes` and you can get a human readable string of the number of bytes by calling `__repr__` - `byte` (integer): number of bytes, """ bytes: int def __repr__(self) -> str: return bytes_to_human_readable(self.bytes) class MemoryState(NamedTuple): """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ frame: Frame cpu: Memory gpu: Memory cpu_gpu: Memory class MemorySummary(NamedTuple): """ `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). """ sequential: List[MemoryState] cumulative: List[MemoryState] total: Memory MemoryTrace = List[UsedMemoryState] def start_memory_tracing( modules_to_trace: Optional[Union[str, Iterable[str]]] = None, modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None, events_to_trace: str = "line", gpus_to_trace: Optional[List[int]] = None, ) -> MemoryTrace: """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `../../examples/benchmarks.py for a usage example. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident Set Size” (the non-swapped physical memory the process is using). See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info Args: - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers1.modeling_gpt2') - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch') - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events) default to line - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs Return: - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script). - `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install psutil (pip install psutil) to use CPU memory tracing." ) process = None else: process = psutil.Process(os.getpid()) try: from py3nvml import py3nvml py3nvml.nvmlInit() devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace py3nvml.nvmlShutdown() except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to use GPU memory tracing." ) log_gpu = False except (OSError, py3nvml.NVMLError): logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") log_gpu = False else: log_gpu = is_torch_available() or is_tf_available() memory_trace = [] def traceit(frame, event, args): """ Tracing method executed before running each line in a module or sub-module Record memory allocated in a list with debugging information """ global _is_memory_tracing_enabled if not _is_memory_tracing_enabled: return traceit # Filter events if events_to_trace is not None: if isinstance(events_to_trace, str) and event != events_to_trace: return traceit elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace: return traceit # Filter modules name = frame.f_globals["__name__"] if not isinstance(name, str): return traceit else: # Filter whitelist of modules to trace if modules_to_trace is not None: if isinstance(modules_to_trace, str) and modules_to_trace not in name: return traceit elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace): return traceit # Filter blacklist of modules not to trace if modules_not_to_trace is not None: if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name: return traceit elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace): return traceit # Record current tracing state (file, location in file...) lineno = frame.f_lineno filename = frame.f_globals["__file__"] if filename.endswith(".pyc") or filename.endswith(".pyo"): filename = filename[:-1] line = linecache.getline(filename, lineno).rstrip() traced_state = Frame(filename, name, lineno, event, line) # Record current memory state (rss memory) and compute difference with previous memory state cpu_mem = 0 if process is not None: mem = process.memory_info() cpu_mem = mem.rss gpu_mem = 0 if log_gpu: # Clear GPU caches if is_torch_available(): torch_empty_cache() if is_tf_available(): tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802 # Sum used memory for all GPUs py3nvml.nvmlInit() for i in devices: handle = py3nvml.nvmlDeviceGetHandleByIndex(i) meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) gpu_mem += meminfo.used py3nvml.nvmlShutdown() mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem) memory_trace.append(mem_state) return traceit sys.settrace(traceit) global _is_memory_tracing_enabled _is_memory_tracing_enabled = True return memory_trace def stop_memory_tracing( memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True ) -> Optional[MemorySummary]: """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given. Args: - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory Return: - None if `memory_trace` is None - `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). `Memory` named tuple have fields - `byte` (integer): number of bytes, - `string` (string): same as human readable string (ex: "3.5MB") `Frame` are namedtuple used to list the current frame state and have the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ global _is_memory_tracing_enabled _is_memory_tracing_enabled = False if memory_trace is not None and len(memory_trace) > 1: memory_diff_trace = [] cumulative_memory_dict = defaultdict(lambda: [0, 0, 0]) for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip( memory_trace[:-1], memory_trace[1:] ): cpu_mem_inc = next_cpu_mem - cpu_mem gpu_mem_inc = next_gpu_mem - gpu_mem cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc memory_diff_trace.append( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) ) cumulative_memory_dict[frame][0] += cpu_mem_inc cumulative_memory_dict[frame][1] += gpu_mem_inc cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc cumulative_memory = sorted( list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True ) # order by the total CPU + GPU memory increase cumulative_memory = list( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory ) if ignore_released_memory: total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace) else: total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace) total_memory = Memory(total_memory) return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory) return None def bytes_to_human_readable(memory_amount: int) -> str: """ Utility to convert a number of bytes (int) in a human readable string (with units) """ for unit in ["B", "KB", "MB", "GB"]: if memory_amount > -1024.0 and memory_amount < 1024.0: return "{:.3f}{}".format(memory_amount, unit) memory_amount /= 1024.0 return "{:.3f}TB".format(memory_amount) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/__init__.py ================================================ from abc import ABC, abstractmethod from argparse import ArgumentParser class BaseTransformersCLICommand(ABC): @staticmethod @abstractmethod def register_subcommand(parser: ArgumentParser): raise NotImplementedError() @abstractmethod def run(self): raise NotImplementedError() ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/convert.py ================================================ from argparse import ArgumentParser, Namespace from logging import getLogger from transformers.commands import BaseTransformersCLICommand def convert_command_factory(args: Namespace): """ Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. :return: ServeCommand """ return ConvertCommand( args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name ) class ConvertCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ train_parser = parser.add_parser( "convert", help="CLI tool to run convert model from original " "author checkpoints to Transformers PyTorch checkpoints.", ) train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") train_parser.add_argument( "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." ) train_parser.add_argument( "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." ) train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") train_parser.add_argument( "--finetuning_task_name", type=str, default=None, help="Optional fine-tuning task name if the TF model was a finetuned model.", ) train_parser.set_defaults(func=convert_command_factory) def __init__( self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str, config: str, finetuning_task_name: str, *args ): self._logger = getLogger("transformers1-cli/converting") self._logger.info("Loading model {}".format(model_type)) self._model_type = model_type self._tf_checkpoint = tf_checkpoint self._pytorch_dump_output = pytorch_dump_output self._config = config self._finetuning_task_name = finetuning_task_name def run(self): if self._model_type == "albert": try: from transformers.convert_albert_original_tf_checkpoint_to_pytorch import ( convert_tf_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "bert": try: from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( convert_tf_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "gpt": from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( convert_openai_checkpoint_to_pytorch, ) convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "transfo_xl": try: from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( convert_transfo_xl_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) if "ckpt" in self._tf_checkpoint.lower(): TF_CHECKPOINT = self._tf_checkpoint TF_DATASET_FILE = "" else: TF_DATASET_FILE = self._tf_checkpoint TF_CHECKPOINT = "" convert_transfo_xl_checkpoint_to_pytorch( TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE ) elif self._model_type == "gpt2": try: from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( convert_gpt2_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "xlnet": try: from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( convert_xlnet_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_xlnet_checkpoint_to_pytorch( self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name ) elif self._model_type == "xlm": from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( convert_xlm_checkpoint_to_pytorch, ) convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) else: raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]") ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/download.py ================================================ from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand def download_command_factory(args): return DownloadCommand(args.model, args.cache_dir, args.force) class DownloadCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): download_parser = parser.add_parser("download") download_parser.add_argument( "--cache-dir", type=str, default=None, help="Path to location to store the models" ) download_parser.add_argument( "--force", action="store_true", help="Force the model to be download even if already in cache-dir" ) download_parser.add_argument("model", type=str, help="Name of the model to download") download_parser.set_defaults(func=download_command_factory) def __init__(self, model: str, cache: str, force: bool): self._model = model self._cache = cache self._force = force def run(self): from transformers import AutoModel, AutoTokenizer AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/env.py ================================================ import platform from argparse import ArgumentParser from transformers import __version__ as version from transformers import is_tf_available, is_torch_available from transformers.commands import BaseTransformersCLICommand def info_command_factory(_): return EnvironmentCommand() class EnvironmentCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): download_parser = parser.add_parser("env") download_parser.set_defaults(func=info_command_factory) def run(self): pt_version = "not installed" pt_cuda_available = "NA" if is_torch_available(): import torch pt_version = torch.__version__ pt_cuda_available = torch.cuda.is_available() tf_version = "not installed" tf_cuda_available = "NA" if is_tf_available(): import tensorflow as tf tf_version = tf.__version__ try: # deprecated in v2.1 tf_cuda_available = tf.test.is_gpu_available() except AttributeError: # returns list of devices, convert to bool tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) info = { "`transformers1` version": version, "Platform": platform.platform(), "Python version": platform.python_version(), "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), "Using GPU in script?": "", "Using distributed or parallel set-up in script?": "", } print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") print(self.format_dict(info)) return info @staticmethod def format_dict(d): return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/run.py ================================================ import logging from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline logger = logging.getLogger(__name__) # pylint: disable=invalid-name def try_infer_format_from_ext(path: str): if not path: return "pipe" for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): return ext raise Exception( "Unable to determine file format from file extension {}. " "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) ) def run_command_factory(args): nlp = pipeline( task=args.task, model=args.model if args.model else None, config=args.config, tokenizer=args.tokenizer, device=args.device, ) format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format reader = PipelineDataFormat.from_str( format=format, output_path=args.output, input_path=args.input, column=args.column if args.column else nlp.default_input_names, overwrite=args.overwrite, ) return RunCommand(nlp, reader) class RunCommand(BaseTransformersCLICommand): def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): self._nlp = nlp self._reader = reader @staticmethod def register_subcommand(parser: ArgumentParser): run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") run_parser.add_argument( "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" ) run_parser.add_argument( "--column", type=str, help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", ) run_parser.add_argument( "--format", type=str, default="infer", choices=PipelineDataFormat.SUPPORTED_FORMATS, help="Input format to read from", ) run_parser.add_argument( "--device", type=int, default=-1, help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", ) run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") run_parser.set_defaults(func=run_command_factory) def run(self): nlp, outputs = self._nlp, [] for entry in self._reader: output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) if isinstance(output, dict): outputs.append(output) else: outputs += output # Saving data if self._nlp.binary_output: binary_path = self._reader.save_binary(outputs) logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) else: self._reader.save(outputs) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/serving.py ================================================ import logging from argparse import ArgumentParser, Namespace from typing import Any, List, Optional from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, pipeline try: from uvicorn import run from fastapi import FastAPI, HTTPException, Body from fastapi.routing import APIRoute from pydantic import BaseModel from starlette.responses import JSONResponse _serve_dependencies_installed = True except (ImportError, AttributeError): BaseModel = object def Body(*x, **y): pass _serve_dependencies_installed = False logger = logging.getLogger("transformers1-cli/serving") def serve_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ nlp = pipeline( task=args.task, model=args.model if args.model else None, config=args.config, tokenizer=args.tokenizer, device=args.device, ) return ServeCommand(nlp, args.host, args.port, args.workers) class ServeModelInfoResult(BaseModel): """ Expose model information """ infos: dict class ServeTokenizeResult(BaseModel): """ Tokenize result model """ tokens: List[str] tokens_ids: Optional[List[int]] class ServeDeTokenizeResult(BaseModel): """ DeTokenize result model """ text: str class ServeForwardResult(BaseModel): """ Forward result model """ output: Any class ServeCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ serve_parser = parser.add_parser( "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." ) serve_parser.add_argument( "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" ) serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers") serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") serve_parser.add_argument( "--device", type=int, default=-1, help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", ) serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): self._pipeline = pipeline self.host = host self.port = port self.workers = workers if not _serve_dependencies_installed: raise RuntimeError( "Using serve command requires FastAPI and unicorn. " 'Please install transformers1 with [serving]: pip install "transformers1[serving]".' "Or install FastAPI and unicorn separately." ) else: logger.info("Serving model over {}:{}".format(host, port)) self._app = FastAPI( routes=[ APIRoute( "/", self.model_info, response_model=ServeModelInfoResult, response_class=JSONResponse, methods=["GET"], ), APIRoute( "/tokenize", self.tokenize, response_model=ServeTokenizeResult, response_class=JSONResponse, methods=["POST"], ), APIRoute( "/detokenize", self.detokenize, response_model=ServeDeTokenizeResult, response_class=JSONResponse, methods=["POST"], ), APIRoute( "/forward", self.forward, response_model=ServeForwardResult, response_class=JSONResponse, methods=["POST"], ), ], timeout=600, ) def run(self): run(self._app, host=self.host, port=self.port, workers=self.workers) def model_info(self): return ServeModelInfoResult(infos=vars(self._pipeline.model.config)) def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): """ Tokenize the provided input and eventually returns corresponding tokens id: - **text_input**: String to tokenize - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping. """ try: tokens_txt = self._pipeline.tokenizer.tokenize(text_input) if return_ids: tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids) else: return ServeTokenizeResult(tokens=tokens_txt) except Exception as e: raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) def detokenize( self, tokens_ids: List[int] = Body(None, embed=True), skip_special_tokens: bool = Body(False, embed=True), cleanup_tokenization_spaces: bool = Body(True, embed=True), ): """ Detokenize the provided tokens ids to readable text: - **tokens_ids**: List of tokens ids - **skip_special_tokens**: Flag indicating to not try to decode special tokens - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. """ try: decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) return ServeDeTokenizeResult(model="", text=decoded_str) except Exception as e: raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) async def forward(self, inputs=Body(None, embed=True)): """ **inputs**: **attention_mask**: **tokens_type_ids**: """ # Check we don't have empty string if len(inputs) == 0: return ServeForwardResult(output=[], attention=[]) try: # Forward through the model output = self._pipeline(inputs) return ServeForwardResult(output=output) except Exception as e: raise HTTPException(500, {"error": str(e)}) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/train.py ================================================ import os from argparse import ArgumentParser, Namespace from logging import getLogger from transformers import SingleSentenceClassificationProcessor as Processor from transformers import TextClassificationPipeline, is_tf_available, is_torch_available from transformers.commands import BaseTransformersCLICommand if not is_tf_available() and not is_torch_available(): raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") # TF training parameters USE_XLA = False USE_AMP = False def train_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ return TrainCommand(args) class TrainCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") train_parser.add_argument( "--train_data", type=str, required=True, help="path to train (and optionally evaluation) dataset as a csv with " "tab separated labels and sentences.", ) train_parser.add_argument( "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." ) train_parser.add_argument( "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." ) train_parser.add_argument( "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." ) train_parser.add_argument( "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." ) train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") train_parser.add_argument( "--validation_split", type=float, default=0.1, help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", ) train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") train_parser.add_argument( "--task", type=str, default="text_classification", help="Task to train the model on." ) train_parser.add_argument( "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." ) train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") train_parser.set_defaults(func=train_command_factory) def __init__(self, args: Namespace): self.logger = getLogger("transformers1-cli/training") self.framework = "tf" if is_tf_available() else "torch" os.makedirs(args.output, exist_ok=True) assert os.path.isdir(args.output) self.output = args.output self.column_label = args.column_label self.column_text = args.column_text self.column_id = args.column_id self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) if args.task == "text_classification": self.pipeline = TextClassificationPipeline.from_pretrained(args.model) elif args.task == "token_classification": raise NotImplementedError elif args.task == "question_answering": raise NotImplementedError self.logger.info("Loading dataset from {}".format(args.train_data)) self.train_dataset = Processor.create_from_csv( args.train_data, column_label=args.column_label, column_text=args.column_text, column_id=args.column_id, skip_first_row=args.skip_first_row, ) self.valid_dataset = None if args.validation_data: self.logger.info("Loading validation dataset from {}".format(args.validation_data)) self.valid_dataset = Processor.create_from_csv( args.validation_data, column_label=args.column_label, column_text=args.column_text, column_id=args.column_id, skip_first_row=args.skip_first_row, ) self.validation_split = args.validation_split self.train_batch_size = args.train_batch_size self.valid_batch_size = args.valid_batch_size self.learning_rate = args.learning_rate self.adam_epsilon = args.adam_epsilon def run(self): if self.framework == "tf": return self.run_tf() return self.run_torch() def run_torch(self): raise NotImplementedError def run_tf(self): self.pipeline.fit( self.train_dataset, validation_data=self.valid_dataset, validation_split=self.validation_split, learning_rate=self.learning_rate, adam_epsilon=self.adam_epsilon, train_batch_size=self.train_batch_size, valid_batch_size=self.valid_batch_size, ) # Save trained pipeline self.pipeline.save_pretrained(self.output) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/transformers_cli.py ================================================ #!/usr/bin/env python from argparse import ArgumentParser from transformers.commands.convert import ConvertCommand from transformers.commands.download import DownloadCommand from transformers.commands.env import EnvironmentCommand from transformers.commands.run import RunCommand from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands def main(): parser = ArgumentParser("Transformers CLI tool", usage="transformers1-cli []") commands_parser = parser.add_subparsers(help="transformers1-cli command helpers") # Register commands ConvertCommand.register_subcommand(commands_parser) DownloadCommand.register_subcommand(commands_parser) EnvironmentCommand.register_subcommand(commands_parser) RunCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) # Let's go args = parser.parse_args() if not hasattr(args, "func"): parser.print_help() exit(1) # Run service = args.func(args) service.run() if __name__ == "__main__": main() ================================================ FILE: code/bert-base-count3/pretrain/transformers1/commands/user.py ================================================ import os import sys from argparse import ArgumentParser from getpass import getpass from typing import List, Union from requests.exceptions import HTTPError from transformers.commands import BaseTransformersCLICommand from transformers.hf_api import HfApi, HfFolder UPLOAD_MAX_FILES = 15 class UserCommands(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): login_parser = parser.add_parser("login", help="Log in using the same credentials as on huggingface.co") login_parser.set_defaults(func=lambda args: LoginCommand(args)) whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.") whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args)) logout_parser = parser.add_parser("logout", help="Log out") logout_parser.set_defaults(func=lambda args: LogoutCommand(args)) # s3 s3_parser = parser.add_parser("s3", help="{ls, rm} Commands to interact with the files you upload on S3.") s3_subparsers = s3_parser.add_subparsers(help="s3 related commands") ls_parser = s3_subparsers.add_parser("ls") ls_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") ls_parser.set_defaults(func=lambda args: ListObjsCommand(args)) rm_parser = s3_subparsers.add_parser("rm") rm_parser.add_argument("filename", type=str, help="individual object filename to delete from S3.") rm_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") rm_parser.set_defaults(func=lambda args: DeleteObjCommand(args)) # upload upload_parser = parser.add_parser("upload", help="Upload a model to S3.") upload_parser.add_argument( "path", type=str, help="Local path of the model folder or individual file to upload." ) upload_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") upload_parser.add_argument( "--filename", type=str, default=None, help="Optional: override individual object filename on S3." ) upload_parser.set_defaults(func=lambda args: UploadCommand(args)) class ANSI: """ Helper for en.wikipedia.org/wiki/ANSI_escape_code """ _bold = "\u001b[1m" _red = "\u001b[31m" _reset = "\u001b[0m" @classmethod def bold(cls, s): return "{}{}{}".format(cls._bold, s, cls._reset) @classmethod def red(cls, s): return "{}{}{}".format(cls._bold + cls._red, s, cls._reset) class BaseUserCommand: def __init__(self, args): self.args = args self._api = HfApi() class LoginCommand(BaseUserCommand): def run(self): print( """ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| """ ) username = input("Username: ") password = getpass() try: token = self._api.login(username, password) except HTTPError as e: # probably invalid credentials, display error message. print(e) print(ANSI.red(e.response.text)) exit(1) HfFolder.save_token(token) print("Login successful") print("Your token:", token, "\n") print("Your token has been saved to", HfFolder.path_token) class WhoamiCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit() try: user, orgs = self._api.whoami(token) print(user) if orgs: print(ANSI.bold("orgs: "), ",".join(orgs)) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) class LogoutCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit() HfFolder.delete_token() self._api.logout(token) print("Successfully logged out.") class ListObjsCommand(BaseUserCommand): def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str: """ Inspired by: stackoverflow.com/a/8356620/593036 stackoverflow.com/questions/9535954/printing-lists-as-tabular-data """ col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)] row_format = ("{{:{}}} " * len(headers)).format(*col_widths) lines = [] lines.append(row_format.format(*headers)) lines.append(row_format.format(*["-" * w for w in col_widths])) for row in rows: lines.append(row_format.format(*row)) return "\n".join(lines) def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) try: objs = self._api.list_objs(token, organization=self.args.organization) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) if len(objs) == 0: print("No shared file yet") exit() rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs] print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])) class DeleteObjCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) try: self._api.delete_obj(token, filename=self.args.filename, organization=self.args.organization) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) print("Done") class UploadCommand(BaseUserCommand): def walk_dir(self, rel_path): """ Recursively list all files in a folder. """ entries: List[os.DirEntry] = list(os.scandir(rel_path)) files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()] # (filepath, filename) for f in entries: if f.is_dir(): files += self.walk_dir(f.path) return files def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) local_path = os.path.abspath(self.args.path) if os.path.isdir(local_path): if self.args.filename is not None: raise ValueError("Cannot specify a filename override when uploading a folder.") rel_path = os.path.basename(local_path) files = self.walk_dir(rel_path) elif os.path.isfile(local_path): filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path) files = [(local_path, filename)] else: raise ValueError("Not a valid file or directory: {}".format(local_path)) if sys.platform == "win32": files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files] if len(files) > UPLOAD_MAX_FILES: print( "About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format( ANSI.bold(len(files)) ) ) exit(1) user, _ = self._api.whoami(token) namespace = self.args.organization if self.args.organization is not None else user for filepath, filename in files: print( "About to upload file {} to S3 under filename {} and namespace {}".format( ANSI.bold(filepath), ANSI.bold(filename), ANSI.bold(namespace) ) ) choice = input("Proceed? [Y/n] ").lower() if not (choice == "" or choice == "y" or choice == "yes"): print("Abort") exit() print(ANSI.bold("Uploading... This might take a while if files are large")) for filepath, filename in files: try: access_url = self._api.presign_and_upload( token=token, filename=filename, filepath=filepath, organization=self.args.organization ) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) print("Your file now lives at:") print(access_url) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_albert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ ALBERT model configuration """ from .configuration_utils import PretrainedConfig ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-config.json", "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-config.json", "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-config.json", "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-config.json", "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", } class AlbertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers1 import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config """ model_type = "albert" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Config class. """ import logging from collections import OrderedDict from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig from .configuration_marian import MarianConfig from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig from .configuration_reformer import ReformerConfig from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig from .configuration_utils import PretrainedConfig from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig logger = logging.getLogger(__name__) ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( (key, value) for pretrained_map in [ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BART_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ] for key, value, in pretrained_map.items() ) CONFIG_MAPPING = OrderedDict( [ ("t5", T5Config,), ("distilbert", DistilBertConfig,), ("albert", AlbertConfig,), ("camembert", CamembertConfig,), ("xlm-roberta", XLMRobertaConfig,), ("marian", MarianConfig,), ("bart", BartConfig,), ("reformer", ReformerConfig,), ("longformer", LongformerConfig,), ("roberta", RobertaConfig,), ("flaubert", FlaubertConfig,), ("bert", BertConfig,), ("openai-gpt", OpenAIGPTConfig,), ("gpt2", GPT2Config,), ("transfo-xl", TransfoXLConfig,), ("xlnet", XLNetConfig,), ("xlm", XLMConfig,), ("ctrl", CTRLConfig,), ("electra", ElectraConfig,), ("encoder-decoder", EncoderDecoderConfig,), ] ) class AutoConfig: r""" :class:`~transformers1.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library when created with the :func:`~transformers1.AutoConfig.from_pretrained` class method. The :func:`~transformers1.AutoConfig.from_pretrained` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string. """ def __init__(self): raise EnvironmentError( "AutoConfig is designed to be instantiated " "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod def for_model(cls, model_type: str, *args, **kwargs): if model_type in CONFIG_MAPPING: config_class = CONFIG_MAPPING[model_type] return config_class(*args, **kwargs) raise ValueError( "Unrecognized model identifier: {}. Should contain one of {}".format( model_type, ", ".join(CONFIG_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiates one of the configuration classes of the library from a pre-trained model configuration. The configuration class to instantiate is selected based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5Config` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertConfig` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertConfig` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertConfig` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaConfig` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerConfig` (Longformer model) - `roberta`: :class:`~transformers1.RobertaConfig` (RoBERTa model) - `reformer`: :class:`~transformers1.ReformerConfig` (Reformer model) - `bert`: :class:`~transformers1.BertConfig` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTConfig` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2Config` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLConfig` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetConfig` (XLNet model) - `xlm`: :class:`~transformers1.XLMConfig` (XLM model) - `ctrl` : :class:`~transformers1.CTRLConfig` (CTRL model) - `flaubert` : :class:`~transformers1.FlaubertConfig` (Flaubert model) - `electra` : :class:`~transformers1.ElectraConfig` (ELECTRA model) Args: pretrained_model_name_or_path (:obj:`string`): Is either: \ - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers1.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir (:obj:`string`, optional, defaults to `None`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download (:obj:`boolean`, optional, defaults to `False`): Force to (re-)download the model weights and configuration files and override the cached versions if they exist. resume_download (:obj:`boolean`, optional, defaults to `False`): Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies (:obj:`Dict[str, str]`, optional, defaults to `None`): A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. See `the requests documentation `__ for usage. return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`): - If False, then this function returns just the final configuration object. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading. - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. Examples:: config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} """ config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) if "model_type" in config_dict: config_class = CONFIG_MAPPING[config_dict["model_type"]] return config_class.from_dict(config_dict, **kwargs) else: # Fallback: use pattern matching on the string. for pattern, config_class in CONFIG_MAPPING.items(): if pattern in pretrained_model_name_or_path: return config_class.from_dict(config_dict, **kwargs) raise ValueError( "Unrecognized model in {}. " "Should have a `model_type` key in its config.json, or contain one of the following strings " "in its name: {}".format(pretrained_model_name_or_path, ", ".join(CONFIG_MAPPING.keys())) ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ BART configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { "facebook/bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", "facebook/bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", "facebook/bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", "facebook/bart-large-xsum": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-xsum/config.json", "facebook/mbart-large-en-ro": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/config.json", } class BartConfig(PretrainedConfig): r""" Configuration class for Bart. Parameters are renamed from the fairseq implementation """ model_type = "bart" def __init__( self, activation_dropout=0.0, activation_function="gelu", vocab_size=50265, d_model=1024, encoder_ffn_dim=4096, encoder_layers=12, encoder_attention_heads=16, decoder_ffn_dim=4096, decoder_layers=12, decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, attention_dropout=0.0, dropout=0.1, max_position_embeddings=1024, init_std=0.02, classifier_dropout=0.0, num_labels=3, is_encoder_decoder=True, pad_token_id=1, bos_token_id=0, eos_token_id=2, normalize_before=False, add_final_layer_norm=False, scale_embedding=False, normalize_embedding=True, static_position_embeddings=False, add_bias_logits=False, **common_kwargs ): r""" :class:`~transformers1.BartConfig` is the configuration class for `BartModel`. Examples: config = BartConfig.from_pretrained('bart-large') model = BartModel(config) """ if "hidden_size" in common_kwargs: raise ValueError("hidden size is called d_model") super().__init__( num_labels=num_labels, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **common_kwargs, ) self.vocab_size = vocab_size self.d_model = d_model # encoder_embed_dim and decoder_embed_dim self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = self.num_hidden_layers = encoder_layers self.encoder_attention_heads = encoder_attention_heads self.encoder_layerdrop = encoder_layerdrop self.decoder_layerdrop = decoder_layerdrop self.decoder_ffn_dim = decoder_ffn_dim self.decoder_layers = decoder_layers self.decoder_attention_heads = decoder_attention_heads self.max_position_embeddings = max_position_embeddings self.init_std = init_std # Normal(0, this parameter) self.activation_function = activation_function # Params introduced for Mbart self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.normalize_embedding = normalize_embedding # True for mbart, False otherwise self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before self.add_final_layer_norm = add_final_layer_norm # Params introduced for Marian self.add_bias_logits = add_bias_logits self.static_position_embeddings = static_position_embeddings # 3 Types of Dropout self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.dropout = dropout # Classifier stuff self.classif_dropout = classifier_dropout @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @property def hidden_size(self) -> int: return self.d_model def is_valid_mbart(self) -> bool: """Is the configuration aligned with the MBART paper.""" if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: return True if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: logger.info("This configuration is a mixture of MBART and BART settings") return False ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ BERT model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/config.json", "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/config.json", "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/config.json", "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/config.json", "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", # See all BERT models at https://huggingface.co/models?filter=bert } class BertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.BertModel`. It is used to instantiate an BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the BERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.BertModel`. hidden_size (:obj:`int`, optional, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 3072): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.BertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. Example:: from transformers1 import BertModel, BertConfig # Initializing a BERT bert-base-uncased style configuration configuration = BertConfig() # Initializing a model from the bert-base-uncased style configuration model = BertModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "bert" def __init__( self, vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, **kwargs ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_camembert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ CamemBERT configuration """ import logging from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", } class CamembertConfig(RobertaConfig): """ This class overrides :class:`~transformers1.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples. """ model_type = "camembert" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Salesforce CTRL configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-config.json"} class CTRLConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.CTRLModel`. It is used to instantiate an CTRL model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `ctrl `__ architecture from SalesForce. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 246534): Vocabulary size of the CTRL model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.CTRLModel`. n_positions (:obj:`int`, optional, defaults to 256): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 256): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 1280): Dimensionality of the embeddings and hidden states. dff (:obj:`int`, optional, defaults to 8192): Dimensionality of the inner dimension of the FFN. n_layer (:obj:`int`, optional, defaults to 48): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Example:: from transformers1 import CTRLModel, CTRLConfig # Initializing a CTRL configuration configuration = CTRLConfig() # Initializing a model from the configuration model = CTRLModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "ctrl" def __init__( self, vocab_size=246534, n_positions=256, n_ctx=256, n_embd=1280, dff=8192, n_layer=48, n_head=16, resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-6, initializer_range=0.02, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, **kwargs ): super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.dff = dff self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ DistilBERT model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", } class DistilBertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.DistilBertModel`. It is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT `distilbert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the DistilBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.BertModel`. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings. n_layers (:obj:`int`, optional, defaults to 6): Number of hidden layers in the Transformer encoder. n_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. dim (:obj:`int`, optional, defaults to 768): Dimensionality of the encoder layers and the pooler layer. hidden_dim (:obj:`int`, optional, defaults to 3072): The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. qa_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilities used in the question answering model :class:`~transformers1.DistilBertForQuestionAnswering`. seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): The dropout probabilities used in the sequence classification model :class:`~transformers1.DistilBertForSequenceClassification`. Example:: from transformers1 import DistilBertModel, DistilBertConfig # Initializing a DistilBERT configuration configuration = DistilBertConfig() # Initializing a model from the configuration model = DistilBertModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "distilbert" def __init__( self, vocab_size=30522, max_position_embeddings=512, sinusoidal_pos_embds=False, n_layers=6, n_heads=12, dim=768, hidden_dim=4 * 768, dropout=0.1, attention_dropout=0.1, activation="gelu", initializer_range=0.02, qa_dropout=0.1, seq_classif_dropout=0.2, pad_token_id=0, **kwargs ): super().__init__(**kwargs, pad_token_id=pad_token_id) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.sinusoidal_pos_embds = sinusoidal_pos_embds self.n_layers = n_layers self.n_heads = n_heads self.dim = dim self.hidden_dim = hidden_dim self.dropout = dropout self.attention_dropout = attention_dropout self.activation = activation self.initializer_range = initializer_range self.qa_dropout = qa_dropout self.seq_classif_dropout = seq_classif_dropout @property def hidden_size(self): return self.dim @property def num_attention_heads(self): return self.n_heads @property def num_hidden_layers(self): return self.n_layers ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_electra.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ ELECTRA model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json", "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json", "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json", "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json", "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json", "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json", } class ElectraConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.ElectraModel`. It is used to instantiate an ELECTRA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA `google/electra-small-discriminator `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the ELECTRA model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.ElectraModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of the encoder layers and the pooler layer. hidden_size (:obj:`int`, optional, defaults to 256): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (:obj:`int`, optional, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 1024): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.ElectraModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. Example:: from transformers1 import ElectraModel, ElectraConfig # Initializing a ELECTRA electra-base-uncased style configuration configuration = ElectraConfig() # Initializing a model from the electra-base-uncased style configuration model = ElectraModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "electra" def __init__( self, vocab_size=30522, embedding_size=128, hidden_size=256, num_hidden_layers=12, num_attention_heads=4, intermediate_size=1024, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, **kwargs ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) class EncoderDecoderConfig(PretrainedConfig): r""" :class:`~transformers1.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. See the documentation for :class:`~transformers1.PretrainedConfig` for more information. Args: kwargs (`optional`): Remaining dictionary of keyword arguments. Notably: encoder (:class:`PretrainedConfig`, optional, defaults to `None`): An instance of a configuration object that defines the encoder config. encoder (:class:`PretrainedConfig`, optional, defaults to `None`): An instance of a configuration object that defines the decoder config. Example:: from transformers1 import BertConfig, EncoderDecoderConfig, EncoderDecoderModel # Initializing a BERT bert-base-uncased style configuration config_encoder = BertConfig() config_decoder = BertConfig() config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) # Initializing a Bert2Bert model from the bert-base-uncased style configurations model = EncoderDecoderModel(config=config) # Accessing the model configuration config_encoder = model.config.encoder config_decoder = model.config.decoder """ model_type = "encoder_decoder" def __init__(self, **kwargs): super().__init__(**kwargs) assert ( "encoder" in kwargs and "decoder" in kwargs ), "Config has to be initialized with encoder and decoder config" encoder_config = kwargs.pop("encoder") encoder_model_type = encoder_config.pop("model_type") decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") from transformers import AutoConfig self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True @classmethod def from_encoder_decoder_configs( cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig ) -> PretrainedConfig: r""" Instantiate a :class:`~transformers1.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model configuration and decoder model configuration. Returns: :class:`EncoderDecoderConfig`: An instance of a configuration object """ return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict()) def to_dict(self): """ Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) output["encoder"] = self.encoder.to_dict() output["decoder"] = self.decoder.to_dict() output["model_type"] = self.__class__.model_type return output ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Flaubert configuration, based on XLM. """ import logging from .configuration_xlm import XLMConfig logger = logging.getLogger(__name__) FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/config.json", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/config.json", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/config.json", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/config.json", } class FlaubertConfig(XLMConfig): """ Configuration class to store the configuration of a `FlaubertModel`. This is the configuration class to store the configuration of a :class:`~transformers1.XLMModel`. It is used to instantiate an XLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlm-mlm-en-2048 `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to apply the layer normalization before or after the feed forward layer following the attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018) layerdrop (:obj:`float`, `optional`, defaults to 0.0): Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with Structured Dropout. ICLR 2020) vocab_size (:obj:`int`, optional, defaults to 30145): Vocabulary size of the Flaubert model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.FlaubertModel`. emb_dim (:obj:`int`, optional, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for the attention mechanism gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): The non-linear activation function (function or string) in the encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. causal (:obj:`boolean`, optional, defaults to :obj:`False`): Set this to `True` for the model to behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead if a bidirectional context. asm (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer. n_langs (:obj:`int`, optional, defaults to 1): The number of languages the model handles. Set to 1 for monolingual models. use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`) Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual models page `__ for information on how to use them. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. init_std (:obj:`int`, optional, defaults to 50257): The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. bos_index (:obj:`int`, optional, defaults to 0): The index of the beginning of sentence token in the vocabulary. eos_index (:obj:`int`, optional, defaults to 1): The index of the end of sentence token in the vocabulary. pad_index (:obj:`int`, optional, defaults to 2): The index of the padding token in the vocabulary. unk_index (:obj:`int`, optional, defaults to 3): The index of the unknown token in the vocabulary. mask_index (:obj:`int`, optional, defaults to 5): The index of the masking token in the vocabulary. is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`): Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. summary_type (:obj:`string`, optional, defaults to "first"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a dropout before the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. mask_token_id (:obj:`int`, optional, defaults to 0): Model agnostic parameter to identify masked tokens when generating text in an MLM context. lang_id (:obj:`int`, optional, defaults to 1): The ID of the language used by the model. This parameter is used when generating text in a given language. """ model_type = "flaubert" def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): """Constructs FlaubertConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.layerdrop = layerdrop self.pre_norm = pre_norm ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ OpenAI GPT-2 configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json", } class GPT2Config(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.GPT2Model`. It is used to instantiate an GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT-2 `small `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 50257): Vocabulary size of the GPT-2 model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.GPT2Model`. n_positions (:obj:`int`, optional, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 1024): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 768): Dimensionality of the embeddings and hidden states. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. activation_function (:obj:`str`, optional, defaults to 'gelu'): Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"]. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 16): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. summary_type (:obj:`string`, optional, defaults to "cls_index"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Add a dropout before the projection and activation Example:: from transformers1 import GPT2Model, GPT2Config # Initializing a GPT2 configuration configuration = GPT2Config() # Initializing a model from the configuration model = GPT2Model(configuration) # Accessing the model configuration configuration = model.config """ model_type = "gpt2" def __init__( self, vocab_size=50257, n_positions=1024, n_ctx=1024, n_embd=768, n_layer=12, n_head=12, activation_function="gelu_new", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, bos_token_id=50256, eos_token_id=50256, **kwargs ): super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.activation_function = activation_function self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Longformer configuration """ import logging from typing import List, Union from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json", "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json", "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json", "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/config.json", "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/config.json", } class LongformerConfig(RobertaConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.LongformerModel`. It is used to instantiate an Longformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa `roberta-base `__ architecture with a sequence length 4,096. The :class:`~transformers1.LongformerConfig` class directly inherits :class:`~transformers1.RobertaConfig`. It reuses the same defaults. Please check the parent class for more information. Args: attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512): Size of an attention window around each token. If :obj:`int`, use the same size for all layers. To specify a different window size for each layer, use a :obj:`List[int]` where ``len(attention_window) == num_hidden_layers``. Example:: from transformers1 import LongformerConfig, LongformerModel # Initializing a Longformer configuration configuration = LongformerConfig() # Initializing a model from the configuration model = LongformerModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "longformer" def __init__(self, attention_window: Union[List[int], int] = 512, sep_token_id: int = 2, **kwargs): super().__init__(**kwargs) self.attention_window = attention_window self.sep_token_id = sep_token_id ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_marian.py ================================================ # coding=utf-8 # Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Marian model configuration """ from .configuration_bart import BartConfig PRETRAINED_CONFIG_ARCHIVE_MAP = { "Helsinki-NLP/opus-mt-en-de": "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/config.json", } class MarianConfig(BartConfig): model_type = "marian" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_mmbt.py ================================================ # coding=utf-8 # Copyright (c) Facebook, Inc. and its affiliates. # Copyright (c) HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ MMBT configuration """ import logging logger = logging.getLogger(__name__) class MMBTConfig(object): """Configuration class to store the configuration of a `MMBT Model`. Args: config (:obj:`~transformers1.PreTrainedConfig`): Config of the underlying Transformer models. Its values are copied over to use a single config. num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): Size of final Linear layer for classification. modal_hidden_size (:obj:`int`, optional, defautls to 2048): Embedding dimension of the non-text modality encoder. """ def __init__(self, config, num_labels=None, modal_hidden_size=2048): self.__dict__ = config.__dict__ self.modal_hidden_size = modal_hidden_size if num_labels: self.num_labels = num_labels ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ OpenAI GPT configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" } class OpenAIGPTConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.OpenAIGPTModel`. It is used to instantiate an GPT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `GPT `__ architecture from OpenAI. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 40478): Vocabulary size of the GPT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.CTRLModel`. n_positions (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 512): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 768): Dimensionality of the embeddings and hidden states. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`): Whether special tokens should be predicted when the model is has a language modeling head. summary_type (:obj:`string`, optional, defaults to "cls_index"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Add a dropout before the projection and activation Example:: from transformers1 import OpenAIGPTConfig, OpenAIGPTModel # Initializing a GPT configuration configuration = OpenAIGPTConfig() # Initializing a model from the configuration model = OpenAIGPTModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "openai-gpt" def __init__( self, vocab_size=40478, n_positions=512, n_ctx=512, n_embd=768, n_layer=12, n_head=12, afn="gelu", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, **kwargs ): super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.afn = afn self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.predict_special_tokens = predict_special_tokens self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Reformer model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/config.json", "google/reformer-enwik8": "https://cdn.huggingface.co/google/reformer-enwik8/config.json", } class ReformerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.ReformerModel`. It is used to instantiate an Reformer model according to the specified arguments, defining the model architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: attention_head_size (:obj:`int`, optional, defaults to 64): Dimensionality of the projected key, query and value vectors attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]): List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local"). For more information on LSHSelfAttention layer, see `LSH Self Attention `__ . For more information on LocalSelfAttention layer, see `Local Self Attention `__ . axial_pos_embds (:obj:`bool`, optional, defaults to True): If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings `__ axial_norm_std (:obj:`float`, optional, defaluts to 1.0): The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings. axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`): The position dims of the axial position encodings. During training the product of the position dims has to equal the sequence length. For more information on how axial position embeddings work, see `Axial Position Encodings `__. axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`): The embedding dims of the axial position encodings. The sum of the embedding dims has to equal the hidden size. For more information on how axial position embeddings work, see `Axial Position Encodings `__. chunk_size_lm_head (:obj:`int`, optional, defaults to 0): The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time. For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . chunk_size_feed_forward (:obj:`int`, optional, defaults to 0): The chunk size of all feed forward layers in the residual attention blocks. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time. For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . eos_token_id (:obj:`int`, optional, defaults to 2): The token id for the token. feed_forward_size (:obj:`int`, optional, defaults to 512): Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block. hash_seed (:obj:`int`, optional, defaults to `None`): Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposed. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"): The non-linear activation function (function or string) in the feed forward layer in the residual attention block. If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. hidden_size (:obj:`int`, optional, defaults to 256): Dimensionality of the output hidden states of the residual attention blocks. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. is_decoder (:obj:`bool`, optional, defaults to False): If `is_decoder` is True, a causal mask is used in addition to `attention_mask`. When using the Reformer for causal language modeling, `is_decoder` is set to `True`. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. local_chunk_length (:obj:`int`, optional, defaults to 64): Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). local_num_chunks_before (:obj:`int`, optional, defaults to 1): Number of previous neighbouring chunks to attend to in LocalSelfAttention layer to itself. local_num_chunks_after (:obj:`int`, optional, defaults to 0): Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself. local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities in LocalSelfAttention. lsh_chunk_length (:obj:`int`, optional, defaults to 64): Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). lsh_num_chunks_before (:obj:`int`, optional, defaults to 1): Number of previous neighbouring chunks to attend to in LSHSelfAttention layer to itself. lsh_num_chunks_after (:obj:`int`, optional, defaults to 0): Number of following neighbouring chunks to attend to in LSHSelfAttention layer to itself. lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities in LSHSelfAttention. max_position_embeddings (:obj:`int`, optional, defaults to 4096): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). num_attention_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`): Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors. The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly. num_hashes (:obj:`int`, optional, defaults to 1): Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme. The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes. pad_token_id (:obj:`int`, optional, defaults to 0): The token id for the token. vocab_size (:obj:`int`, optional, defaults to 320): Vocabulary size of the Reformer model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.ReformerModel`. Example:: from transformers1 import ReformerModel, ReformerConfig # Initializing a Reformer configuration configuration = ReformerConfig() # Initializing a Reformer model model = ReformerModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "reformer" def __init__( self, attention_head_size=64, attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"], axial_norm_std=1.0, axial_pos_embds=True, axial_pos_shape=[64, 64], axial_pos_embds_dim=[64, 192], chunk_size_lm_head=0, chunk_size_feed_forward=0, eos_token_id=2, feed_forward_size=512, hash_seed=None, hidden_act="relu", hidden_dropout_prob=0.05, hidden_size=256, initializer_range=0.02, is_decoder=False, layer_norm_eps=1e-12, local_num_chunks_before=1, local_num_chunks_after=0, local_attention_probs_dropout_prob=0.05, local_attn_chunk_length=64, lsh_attn_chunk_length=64, lsh_attention_probs_dropout_prob=0.0, lsh_num_chunks_before=1, lsh_num_chunks_after=0, max_position_embeddings=4096, num_attention_heads=2, num_buckets=None, num_hashes=1, pad_token_id=0, vocab_size=320, **kwargs ): super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_decoder=is_decoder, **kwargs) self.hash_seed = hash_seed self.vocab_size = vocab_size self.attention_head_size = attention_head_size self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads self.num_hashes = num_hashes self.num_hidden_layers = len(attn_layers) self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets self.lsh_attn_chunk_length = lsh_attn_chunk_length self.local_attn_chunk_length = local_attn_chunk_length self.lsh_num_chunks_after = lsh_num_chunks_after self.lsh_num_chunks_before = lsh_num_chunks_before self.local_num_chunks_after = local_num_chunks_after self.local_num_chunks_before = local_num_chunks_before self.hidden_act = hidden_act self.feed_forward_size = feed_forward_size self.hidden_dropout_prob = hidden_dropout_prob self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.axial_pos_embds = axial_pos_embds self.axial_pos_shape = tuple(axial_pos_shape) self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) self.axial_norm_std = axial_norm_std self.chunk_size_lm_head = chunk_size_lm_head self.chunk_size_feed_forward = chunk_size_feed_forward self.attn_layers = attn_layers ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ RoBERTa configuration """ import logging from .configuration_bert import BertConfig logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", } class RobertaConfig(BertConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.RobertaModel`. It is used to instantiate an RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. The :class:`~transformers1.RobertaConfig` class directly inherits :class:`~transformers1.BertConfig`. It reuses the same defaults. Please check the parent class for more information. Example:: from transformers1 import RobertaConfig, RobertaModel # Initializing a RoBERTa configuration configuration = RobertaConfig() # Initializing a model from the configuration model = RobertaModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "roberta" def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): """Constructs RobertaConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_t5.py ================================================ # coding=utf-8 # Copyright 2010, The T5 Authors and HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ T5 model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", } class T5Config(PretrainedConfig): r""" :class:`~transformers1.T5Config` is the configuration class to store the configuration of a `T5Model`. Arguments: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. d_model: Size of the encoder layers and the pooler layer. `d_model` can also accesed via the property `hidden_size`. num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`. num_heads: Number of attention heads for each attention layer in the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`. intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. n_positions: The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings'. type_vocab_size: The vocabulary size of the `token_type_ids` passed into `T5Model`. initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). layer_norm_eps: The epsilon used by LayerNorm. """ model_type = "t5" def __init__( self, vocab_size=32128, n_positions=512, d_model=512, d_kv=64, d_ff=2048, num_layers=6, num_heads=8, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-6, initializer_factor=1.0, is_encoder_decoder=True, pad_token_id=0, eos_token_id=1, **kwargs ): super().__init__( pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs, ) self.vocab_size = vocab_size self.n_positions = n_positions self.d_model = d_model self.d_kv = d_kv self.d_ff = d_ff self.num_layers = num_layers self.num_heads = num_heads self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate self.layer_norm_epsilon = layer_norm_epsilon self.initializer_factor = initializer_factor @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.num_heads @property def num_hidden_layers(self): return self.num_layers ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Transformer XL configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", } class TransfoXLConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.TransfoXLModel`. It is used to instantiate a Transformer XL model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `Transformer XL `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 267735): Vocabulary size of the Transformer XL model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.TransfoXLModel`. cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`): Cutoffs for the adaptive softmax d_model (:obj:`int`, optional, defaults to 1024): Dimensionality of the model's hidden states. d_embed (:obj:`int`, optional, defaults to 1024): Dimensionality of the embeddings n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. d_head (:obj:`int`, optional, defaults to 64): Dimensionality of the model's heads. d_inner (:obj:`int`, optional, defaults to 4096): Inner dimension in FF div_val (:obj:`int`, optional, defaults to 4): Divident value for adapative input and softmax pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`): Apply LayerNorm to the input instead of the output n_layer (:obj:`int`, optional, defaults to 18): Number of hidden layers in the Transformer encoder. tgt_len (:obj:`int`, optional, defaults to 128): Number of tokens to predict ext_len (:obj:`int`, optional, defaults to 0): Length of the extended context mem_len (:obj:`int`, optional, defaults to 1600): Length of the retained previous heads clamp_len (:obj:`int`, optional, defaults to 1000): use the same pos embeddings after clamp_len same_length (:obj:`boolean`, optional, defaults to :obj:`True`): Use the same attn length for all tokens proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`): True to share all but first projs, False not to share. attn_type (:obj:`int`, optional, defaults to 0): Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. sample_softmax (:obj:`int`, optional, defaults to -1): number of samples in sampled softmax adaptive (:obj:`boolean`, optional, defaults to :obj:`True`): use adaptive softmax tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`): tie the word embedding and softmax weights dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. dropatt (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): Untie relative position biases init (:obj:`string`, optional, defaults to `normal`): Parameter initializer to use init_range (:obj:`float`, optional, defaults to 0.01): Parameters initialized by U(-init_range, init_range). proj_init_std (:obj:`float`, optional, defaults to 0.01): Parameters initialized by N(0, init_std) init_std (:obj:`float`, optional, defaults to 0.02): Parameters initialized by N(0, init_std) layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers Example:: from transformers1 import TransfoXLConfig, TransfoXLModel # Initializing a Transformer XL configuration configuration = TransfoXLConfig() # Initializing a model from the configuration model = TransfoXLModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "transfo-xl" def __init__( self, vocab_size=267735, cutoffs=[20000, 40000, 200000], d_model=1024, d_embed=1024, n_head=16, d_head=64, d_inner=4096, div_val=4, pre_lnorm=False, n_layer=18, tgt_len=128, ext_len=0, mem_len=1600, clamp_len=1000, same_length=True, proj_share_all_but_first=True, attn_type=0, sample_softmax=-1, adaptive=True, tie_weight=True, dropout=0.1, dropatt=0.0, untie_r=True, init="normal", init_range=0.01, proj_init_std=0.01, init_std=0.02, layer_norm_epsilon=1e-5, eos_token_id=0, **kwargs ): super().__init__(eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) self.tie_weight = tie_weight if proj_share_all_but_first: self.tie_projs = [False] + [True] * len(self.cutoffs) else: self.tie_projs = [False] + [False] * len(self.cutoffs) self.d_model = d_model self.d_embed = d_embed self.d_head = d_head self.d_inner = d_inner self.div_val = div_val self.pre_lnorm = pre_lnorm self.n_layer = n_layer self.n_head = n_head self.tgt_len = tgt_len self.ext_len = ext_len self.mem_len = mem_len self.same_length = same_length self.attn_type = attn_type self.clamp_len = clamp_len self.sample_softmax = sample_softmax self.adaptive = adaptive self.dropout = dropout self.dropatt = dropatt self.untie_r = untie_r self.init = init self.init_range = init_range self.proj_init_std = proj_init_std self.init_std = init_std self.layer_norm_epsilon = layer_norm_epsilon @property def max_position_embeddings(self): return self.tgt_len + self.ext_len + self.mem_len @property def n_token(self): # Backward compatibility return self.vocab_size @n_token.setter def n_token(self, value): # Backward compatibility self.vocab_size = value @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Configuration base class and utilities.""" import copy import json import logging import os from typing import Dict, Tuple from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url logger = logging.getLogger(__name__) class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. Note: A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. It only affects the model's configuration. Class attributes (overridden by derived classes): - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers1.AutoConfig`. Args: finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. num_labels (:obj:`int`, `optional`, defaults to `2`): Number of classes to use when the model is a classification model (sequences/tokens) output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): Should the model returns attentions weights. output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): Should the model returns all hidden-states. torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): Is the model used with Torchscript (for PyTorch models). """ model_type: str = "" def __init__(self, **kwargs): # Attributes with defaults self.output_attentions = kwargs.pop("output_attentions", False) self.output_hidden_states = kwargs.pop("output_hidden_states", False) self.use_cache = kwargs.pop("use_cache", True) # Not used by all models self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models self.use_bfloat16 = kwargs.pop("use_bfloat16", False) self.pruned_heads = kwargs.pop("pruned_heads", {}) # Is decoder is used in encoder-decoder models to differentiate encoder from decoder self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) self.is_decoder = kwargs.pop("is_decoder", False) # Parameters for sequence generation self.max_length = kwargs.pop("max_length", 20) self.min_length = kwargs.pop("min_length", 0) self.do_sample = kwargs.pop("do_sample", False) self.early_stopping = kwargs.pop("early_stopping", False) self.num_beams = kwargs.pop("num_beams", 1) self.temperature = kwargs.pop("temperature", 1.0) self.top_k = kwargs.pop("top_k", 50) self.top_p = kwargs.pop("top_p", 1.0) self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) self.bad_words_ids = kwargs.pop("bad_words_ids", None) self.num_return_sequences = kwargs.pop("num_return_sequences", 1) # Fine-tuning task arguments self.architectures = kwargs.pop("architectures", None) self.finetuning_task = kwargs.pop("finetuning_task", None) self.id2label = kwargs.pop("id2label", None) self.label2id = kwargs.pop("label2id", None) if self.id2label is not None: kwargs.pop("num_labels", None) self.id2label = dict((int(key), value) for key, value in self.id2label.items()) # Keys are always strings in JSON so convert ids to int here. else: self.num_labels = kwargs.pop("num_labels", 2) # Tokenizer arguments TODO: eventually tokenizer and models should share the same config self.prefix = kwargs.pop("prefix", None) self.bos_token_id = kwargs.pop("bos_token_id", None) self.pad_token_id = kwargs.pop("pad_token_id", None) self.eos_token_id = kwargs.pop("eos_token_id", None) self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) # task specific arguments self.task_specific_params = kwargs.pop("task_specific_params", None) # TPU arguments self.xla_device = kwargs.pop("xla_device", None) # Additional attributes without default values for key, value in kwargs.items(): try: setattr(self, key, value) except AttributeError as err: logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err @property def num_labels(self): return len(self.id2label) @num_labels.setter def num_labels(self, num_labels): self.id2label = {i: "LABEL_{}".format(i) for i in range(num_labels)} self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers1.PretrainedConfig.from_pretrained` class method. Args: save_directory (:obj:`string`): Directory where the configuration JSON file will be saved. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) self.to_json_file(output_config_file, use_diff=True) logger.info("Configuration saved in {}".format(output_config_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": r""" Instantiate a :class:`~transformers1.PretrainedConfig` (or a derived class) from a pre-trained model configuration. Args: pretrained_model_name_or_path (:obj:`string`): either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers1.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir (:obj:`string`, `optional`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. kwargs (:obj:`Dict[str, any]`, `optional`): The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Force to (re-)download the model weights and configuration files and override the cached versions if they exist. resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies (:obj:`Dict`, `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. return_unused_kwargs: (`optional`) bool: If False, then this function returns just the final configuration object. If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. Returns: :class:`PretrainedConfig`: An instance of a configuration object Examples:: # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a # derived class: BertConfig config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} """ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(config_dict, **kwargs) @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[Dict, Dict]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a Config using `from_dict`. Parameters: pretrained_model_name_or_path (:obj:`string`): The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. Returns: :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) if os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path else: config_file = hf_bucket_url(pretrained_model_name_or_path, filename=CONFIG_NAME, use_cdn=False) try: # Load from URL or cache if already cached resolved_config_file = cached_path( config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) # Load config dict if resolved_config_file is None: raise EnvironmentError config_dict = cls._dict_from_json_file(resolved_config_file) except EnvironmentError: msg = ( f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" ) raise EnvironmentError(msg) except json.JSONDecodeError: msg = ( "Couldn't reach server at '{}' to download configuration file or " "configuration file is not a valid JSON file. " "Please check network or file content here: {}.".format(config_file, resolved_config_file) ) raise EnvironmentError(msg) if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) return config_dict, kwargs @classmethod def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": """ Constructs a `Config` from a Python dictionary of parameters. Args: config_dict (:obj:`Dict[str, any]`): Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved from a pre-trained checkpoint by leveraging the :func:`~transformers1.PretrainedConfig.get_config_dict` method. kwargs (:obj:`Dict[str, any]`): Additional parameters from which to initialize the configuration object. Returns: :class:`PretrainedConfig`: An instance of a configuration object """ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) config = cls(**config_dict) if hasattr(config, "pruned_heads"): config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(config, key): setattr(config, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model config %s", str(config)) if return_unused_kwargs: return config, kwargs else: return config @classmethod def from_json_file(cls, json_file: str) -> "PretrainedConfig": """ Constructs a `Config` from the path to a json file of parameters. Args: json_file (:obj:`string`): Path to the JSON file containing the parameters. Returns: :class:`PretrainedConfig`: An instance of a configuration object """ config_dict = cls._dict_from_json_file(json_file) return cls(**config_dict) @classmethod def _dict_from_json_file(cls, json_file: str): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) def __eq__(self, other): return self.__dict__ == other.__dict__ def __repr__(self): return "{} {}".format(self.__class__.__name__, self.to_json_string()) def to_diff_dict(self): """ Removes all attributes from config which correspond to the default config attributes for better readability and serializes to a Python dictionary. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ config_dict = self.to_dict() # get the default config dict default_config_dict = PretrainedConfig().to_dict() serializable_config_dict = {} # only serialize values that differ from the default config for key, value in config_dict.items(): if key not in default_config_dict or value != default_config_dict[key]: serializable_config_dict[key] = value return serializable_config_dict def to_dict(self): """ Serializes this instance to a Python dictionary. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) if hasattr(self.__class__, "model_type"): output["model_type"] = self.__class__.model_type return output def to_json_string(self, use_diff=True): """ Serializes this instance to a JSON string. Args: use_diff (:obj:`bool`): If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON string. Returns: :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. """ if use_diff is True: config_dict = self.to_diff_dict() else: config_dict = self.to_dict() return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path, use_diff=True): """ Save this instance to a json file. Args: json_file_path (:obj:`string`): Path to the JSON file in which this configuration instance's parameters will be saved. use_diff (:obj:`bool`): If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON file. """ with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string(use_diff=use_diff)) def update(self, config_dict: Dict): """ Updates attributes of this class with attributes from `config_dict`. Args: :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. """ for key, value in config_dict.items(): setattr(self, key, value) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLM configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", } class XLMConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.XLMModel`. It is used to instantiate an XLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlm-mlm-en-2048 `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30145): Vocabulary size of the XLM model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.XLMModel`. emb_dim (:obj:`int`, optional, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for the attention mechanism gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): The non-linear activation function (function or string) in the encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. causal (:obj:`boolean`, optional, defaults to :obj:`False`): Set this to `True` for the model to behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead if a bidirectional context. asm (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer. n_langs (:obj:`int`, optional, defaults to 1): The number of languages the model handles. Set to 1 for monolingual models. use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`) Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual models page `__ for information on how to use them. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. init_std (:obj:`int`, optional, defaults to 50257): The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. bos_index (:obj:`int`, optional, defaults to 0): The index of the beginning of sentence token in the vocabulary. eos_index (:obj:`int`, optional, defaults to 1): The index of the end of sentence token in the vocabulary. pad_index (:obj:`int`, optional, defaults to 2): The index of the padding token in the vocabulary. unk_index (:obj:`int`, optional, defaults to 3): The index of the unknown token in the vocabulary. mask_index (:obj:`int`, optional, defaults to 5): The index of the masking token in the vocabulary. is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`): Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. summary_type (:obj:`string`, optional, defaults to "first"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a dropout before the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. mask_token_id (:obj:`int`, optional, defaults to 0): Model agnostic parameter to identify masked tokens when generating text in an MLM context. lang_id (:obj:`int`, optional, defaults to 1): The ID of the language used by the model. This parameter is used when generating text in a given language. Example:: from transformers1 import XLMConfig, XLMModel # Initializing a XLM configuration configuration = XLMConfig() # Initializing a model from the configuration model = XLMModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "xlm" def __init__( self, vocab_size=30145, emb_dim=2048, n_layers=12, n_heads=16, dropout=0.1, attention_dropout=0.1, gelu_activation=True, sinusoidal_embeddings=False, causal=False, asm=False, n_langs=1, use_lang_emb=True, max_position_embeddings=512, embed_init_std=2048 ** -0.5, layer_norm_eps=1e-12, init_std=0.02, bos_index=0, eos_index=1, pad_index=2, unk_index=3, mask_index=5, is_encoder=True, summary_type="first", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, start_n_top=5, end_n_top=5, mask_token_id=0, lang_id=0, pad_token_id=2, bos_token_id=0, **kwargs ): """Constructs XLMConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.vocab_size = vocab_size self.emb_dim = emb_dim self.n_layers = n_layers self.n_heads = n_heads self.dropout = dropout self.attention_dropout = attention_dropout self.gelu_activation = gelu_activation self.sinusoidal_embeddings = sinusoidal_embeddings self.causal = causal self.asm = asm self.n_langs = n_langs self.use_lang_emb = use_lang_emb self.layer_norm_eps = layer_norm_eps self.bos_index = bos_index self.eos_index = eos_index self.pad_index = pad_index self.unk_index = unk_index self.mask_index = mask_index self.is_encoder = is_encoder self.max_position_embeddings = max_position_embeddings self.embed_init_std = embed_init_std self.init_std = init_std self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_proj_to_labels = summary_proj_to_labels self.summary_first_dropout = summary_first_dropout self.start_n_top = start_n_top self.end_n_top = end_n_top self.mask_token_id = mask_token_id self.lang_id = lang_id if "n_words" in kwargs: self.n_words = kwargs["n_words"] @property def n_words(self): # For backward compatibility return self.vocab_size @n_words.setter def n_words(self, value): # For backward compatibility self.vocab_size = value @property def hidden_size(self): return self.emb_dim @property def num_attention_heads(self): return self.n_heads @property def num_hidden_layers(self): return self.n_layers ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLM-RoBERTa configuration """ import logging from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", } class XLMRobertaConfig(RobertaConfig): """ This class overrides :class:`~transformers1.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples. """ model_type = "xlm-roberta" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/configuration_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLNet configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", } class XLNetConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.XLNetModel`. It is used to instantiate an XLNet model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlnet-large-cased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 32000): Vocabulary size of the XLNet model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.XLNetModel`. d_model (:obj:`int`, optional, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 24): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. d_inner (:obj:`int`, optional, defaults to 4096): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. ff_activation (:obj:`string`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): Untie relative position biases attn_type (:obj:`string`, optional, defaults to "bi"): The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous forward pass won't be re-computed. See the `quickstart `__ for more information. reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): The number of tokens in the current batch to be cached and reused in the future. bi_data (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use bidirectional input pipeline. Usually set to `True` during pretraining and `False` during finetuning. clamp_len (:obj:`int`, optional, defaults to -1): Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping. same_length (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use the same attention length for each token. summary_type (:obj:`string`, optional, defaults to "last"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_last_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Add a dropout after the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. Example:: from transformers1 import XLNetConfig, XLNetModel # Initializing a XLNet configuration configuration = XLNetConfig() # Initializing a model from the configuration model = XLNetModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "xlnet" def __init__( self, vocab_size=32000, d_model=1024, n_layer=24, n_head=16, d_inner=4096, ff_activation="gelu", untie_r=True, attn_type="bi", initializer_range=0.02, layer_norm_eps=1e-12, dropout=0.1, mem_len=None, reuse_len=None, bi_data=False, clamp_len=-1, same_length=False, summary_type="last", summary_use_proj=True, summary_activation="tanh", summary_last_dropout=0.1, start_n_top=5, end_n_top=5, pad_token_id=5, bos_token_id=1, eos_token_id=2, **kwargs ): """Constructs XLNetConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.d_model = d_model self.n_layer = n_layer self.n_head = n_head assert d_model % n_head == 0 self.d_head = d_model // n_head self.ff_activation = ff_activation self.d_inner = d_inner self.untie_r = untie_r self.attn_type = attn_type self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.dropout = dropout self.mem_len = mem_len self.reuse_len = reuse_len self.bi_data = bi_data self.clamp_len = clamp_len self.same_length = same_length self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_last_dropout = summary_last_dropout self.start_n_top = start_n_top self.end_n_top = end_n_top self.bos_token_id = bos_token_id self.pad_token_id = pad_token_id self.eos_token_id = eos_token_id @property def max_position_embeddings(self): return -1 @property def n_token(self): # Backward compatibility return self.vocab_size @n_token.setter def n_token(self, value): # Backward compatibility self.vocab_size = value @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_albert_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert ALBERT checkpoint.""" import argparse import logging import torch from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): # Initialise PyTorch model config = AlbertConfig.from_json_file(albert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = AlbertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_albert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--albert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained ALBERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_bart_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BART checkpoint.""" import argparse import logging import os from pathlib import Path import fairseq import torch from packaging import version from transformers import ( BartConfig, BartForConditionalGeneration, BartForSequenceClassification, BartModel, BartTokenizer, ) from transformers.modeling_bart import _make_linear_from_emb FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} if version.parse(fairseq.__version__) < version.parse("0.9.0"): raise Exception("requires fairseq >= 0.9.0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) SAMPLE_TEXT = " Hello world! cécé herlolip" mnli_rename_keys = [ ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), ] def remove_ignore_keys_(state_dict): ignore_keys = [ "encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor", ] for k in ignore_keys: state_dict.pop(k, None) def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val def load_xsum_checkpoint(checkpoint_path): """Checkpoint path should end in model.pt""" sd = torch.load(checkpoint_path, map_location="cpu") hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() hub_interface.model.load_state_dict(sd["model"]) return hub_interface def convert_checkpoint_from_disk(checkpoint_path, **config_kwargs): state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] remove_ignore_keys_(state_dict) vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] mbart_config = BartConfig(vocab_size=vocab_size, **config_kwargs) model = BartForConditionalGeneration(mbart_config) model.model.load_state_dict(state_dict) if hasattr(model, "lm_head"): model.lm_head = _make_linear_from_emb(model.model.shared) return model @torch.no_grad() def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): """ Copy/paste/tweak model's weights to our BERT structure. """ if not os.path.exists(checkpoint_path): bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() else: bart = load_xsum_checkpoint(checkpoint_path) bart.model.upgrade_state_dict(bart.model.state_dict()) if hf_checkpoint_name is None: hf_checkpoint_name = checkpoint_path.replace(".", "-") config = BartConfig.from_pretrained(hf_checkpoint_name) tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) assert torch.eq(tokens, tokens2).all() if checkpoint_path == "bart.large.mnli": state_dict = bart.state_dict() remove_ignore_keys_(state_dict) state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] for src, dest in mnli_rename_keys: rename_key(state_dict, src, dest) model = BartForSequenceClassification(config).eval() model.load_state_dict(state_dict) fairseq_output = bart.predict("mnli", tokens, return_logits=True) new_model_outputs = model(tokens)[0] # logits else: # no classification heads to worry about state_dict = bart.model.state_dict() remove_ignore_keys_(state_dict) state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] fairseq_output = bart.extract_features(tokens) if hf_checkpoint_name == "facebook/bart-large": model = BartModel(config).eval() model.load_state_dict(state_dict) new_model_outputs = model(tokens).model[0] else: model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt model.model.load_state_dict(state_dict) if hasattr(model, "lm_head"): model.lm_head = _make_linear_from_emb(model.model.shared) new_model_outputs = model.model(tokens)[0] # Check results assert fairseq_output.shape == new_model_outputs.shape assert (fairseq_output == new_model_outputs).all().item() Path(pytorch_dump_folder_path).mkdir(exist_ok=True) model.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." ) parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") parser.add_argument( "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" ) args = parser.parse_args() convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_bert_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BERT checkpoint.""" import argparse import logging import torch from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = BertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_bert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_bert_pytorch_checkpoint_to_original_tf.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" import argparse import os import numpy as np import tensorflow as tf import torch from transformers import BertModel def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): """ :param model:BertModel Pytorch model instance to be converted :param ckpt_dir: Tensorflow model directory :param model_name: model name :return: Currently supported HF models: Y BertModel N BertForMaskedLM N BertForPreTraining N BertForMultipleChoice N BertForNextSentencePrediction N BertForSequenceClassification N BertForQuestionAnswering """ tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") var_map = ( ("layer.", "layer_"), ("word_embeddings.weight", "word_embeddings"), ("position_embeddings.weight", "position_embeddings"), ("token_type_embeddings.weight", "token_type_embeddings"), (".", "/"), ("LayerNorm/weight", "LayerNorm/gamma"), ("LayerNorm/bias", "LayerNorm/beta"), ("weight", "kernel"), ) if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) state_dict = model.state_dict() def to_tf_var_name(name: str): for patt, repl in iter(var_map): name = name.replace(patt, repl) return "bert/{}".format(name) def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) session.run(tf_var) return tf_var tf.reset_default_graph() with tf.Session() as session: for var_name in state_dict: tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() if any([x in var_name for x in tensors_to_transpose]): torch_tensor = torch_tensor.T tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) tf.keras.backend.set_value(tf_var, torch_tensor) tf_weight = session.run(tf_var) print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) saver = tf.train.Saver(tf.trainable_variables()) saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) def main(raw_args=None): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased") parser.add_argument( "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" ) parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") args = parser.parse_args(raw_args) model = BertModel.from_pretrained( pretrained_model_name_or_path=args.model_name, state_dict=torch.load(args.pytorch_model_path), cache_dir=args.cache_dir, ) convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) if __name__ == "__main__": main() ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py ================================================ import argparse import os import torch from transformers.file_utils import WEIGHTS_NAME DIALOGPT_MODELS = ["small", "medium", "large"] OLD_KEY = "lm_head.decoder.weight" NEW_KEY = "lm_head.weight" def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): d = torch.load(checkpoint_path) d[NEW_KEY] = d.pop(OLD_KEY) os.makedirs(pytorch_dump_folder_path, exist_ok=True) torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dialogpt_path", default=".", type=str) args = parser.parse_args() for MODEL in DIALOGPT_MODELS: checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" convert_dialogpt_checkpoint( checkpoint_path, pytorch_dump_folder_path, ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_electra_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert ELECTRA checkpoint.""" import argparse import logging import torch from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): # Initialise PyTorch model config = ElectraConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) if discriminator_or_generator == "discriminator": model = ElectraForPreTraining(config) elif discriminator_or_generator == "generator": model = ElectraForMaskedLM(config) else: raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") # Load weights from tf checkpoint load_tf_weights_in_electra( model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator ) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--discriminator_or_generator", default=None, type=str, required=True, help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " "'generator'.", ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch( args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_gpt2_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import logging import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 logging.basicConfig(level=logging.INFO) def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): # Construct model if gpt2_config_file == "": config = GPT2Config() else: config = GPT2Config.from_json_file(gpt2_config_file) model = GPT2Model(config) # Load weights from numpy load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--gpt2_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained OpenAI model. \n" "This specifies the model architecture.", ) args = parser.parse_args() convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_graph_to_onnx.py ================================================ from argparse import ArgumentParser from os import listdir, makedirs from os.path import abspath, dirname, exists from typing import Dict, List, Optional, Tuple from transformers import is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers1 models to ONNX IR """ def __init__(self): super(OnnxConverterArgumentParser, self).__init__("ONNX Converter") self.add_argument("--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)") self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument("--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model") self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument("--check-loading", action="store_true", help="Check ONNX is able to load the model") self.add_argument("--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb") self.add_argument("output") def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any None Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ model_args_name = model.forward.__code__.co_varnames ordered_input_names = [] model_args = [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: break return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: def build_shape_dict(tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError("Unable to infer tensor axes ({})".format(len(tensor.shape))) else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) return axes tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = ["output_{}".format(i) for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args(framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline: # If no tokenizer provided if tokenizer is None: tokenizer = model print("Loading pipeline (model: {}, tokenizer: {})".format(model, tokenizer)) # Allocate tokenizer and model return pipeline("feature-extraction", model=model, tokenizer=tokenizer, framework=framework) def convert_pytorch(nlp: Pipeline, opset: int, output: str, use_external_format: bool): if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print("PyTorch: {}".format(torch.__version__)) with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output, input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: str): if not is_tf_available(): raise Exception( "Cannot convert {} because TF is not installed. Please install torch first.".format(args.model) ) print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import convert_keras, save_model, __version__ as k2ov print("TensorFlow: {}, keras2onnx: {}".format(tf.version.VERSION, k2ov)) # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output) except ImportError as e: raise Exception( "Cannot import {} required to convert TF model to ONNX. Please install {} first.".format(e.name, e.name) ) def convert( framework: str, model: str, output: str, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, ): print("ONNX opset version set to: {}".format(opset)) # Load the pipeline nlp = load_graph_from_args(framework, model, tokenizer) parent = dirname(output) if not exists(parent): print("Creating folder {}".format(parent)) makedirs(parent) elif len(listdir(parent)) > 0: raise Exception("Folder {} is not empty, aborting conversion".format(parent)) # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def verify(path: str): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print("Checking ONNX model loading from: {}".format(path)) try: onnx_options = SessionOptions() _ = InferenceSession(path, onnx_options, providers=["CPUExecutionProvider"]) print("Model correctly loaded") except RuntimeException as re: print("Error while loading the model: {}".format(re)) if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = abspath(args.output) try: # Convert convert(args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format) # And verify if args.check_loading: verify(args.output) except Exception as e: print("Error while converting the model: {}".format(e)) exit(1) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_longformer_original_pytorch_lightning_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert RoBERTa checkpoint.""" import argparse import pytorch_lightning as pl import torch from transformers.modeling_longformer import LongformerForQuestionAnswering, LongformerModel class LightningModel(pl.LightningModule): def __init__(self, model): super().__init__() self.model = model self.num_labels = 2 self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) # implement only because lighning requires to do so def forward(self): pass def convert_longformer_qa_checkpoint_to_pytorch( longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str ): # load longformer model from model identifier longformer = LongformerModel.from_pretrained(longformer_model) lightning_model = LightningModel(longformer) ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) lightning_model.load_state_dict(ckpt["state_dict"]) # init longformer question answering model longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) # transfer weights longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) longformer_for_qa.eval() # save model longformer_for_qa.save_pretrained(pytorch_dump_folder_path) print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path)) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--longformer_model", default=None, type=str, required=True, help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", ) parser.add_argument( "--longformer_question_answering_ckpt_path", default=None, type=str, required=True, help="Path the official PyTorch Lighning Checkpoint.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_longformer_qa_checkpoint_to_pytorch( args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_marian_to_pytorch.py ================================================ import argparse import json import os import shutil import warnings from pathlib import Path from typing import Dict, List, Union from zipfile import ZipFile import numpy as np import torch from tqdm import tqdm from transformers import MarianConfig, MarianMTModel, MarianTokenizer from transformers.hf_api import HfApi def remove_prefix(text: str, prefix: str): if text.startswith(prefix): return text[len(prefix) :] return text # or whatever def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): sd = {} for k in opus_dict: if not k.startswith(layer_prefix): continue stripped = remove_prefix(k, layer_prefix) v = opus_dict[k].T # besides embeddings, everything must be transposed. sd[converter[stripped]] = torch.tensor(v).squeeze() return sd def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is_decoder=False): for i, layer in enumerate(layer_lst): layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" sd = convert_encoder_layer(opus_state, layer_tag, converter) layer.load_state_dict(sd, strict=True) def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]: """Find models that can accept src_lang as input and return tgt_lang as output.""" prefix = "Helsinki-NLP/opus-mt-" api = HfApi() model_list = api.model_list() model_ids = [x.modelId for x in model_list if x.modelId.startswith("Helsinki-NLP")] src_and_targ = [ remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m ] # + cant be loaded. matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b] return matching def add_emb_entries(wemb, final_bias, n_special_tokens=1): vsize, d_model = wemb.shape embs_to_add = np.zeros((n_special_tokens, d_model)) new_embs = np.concatenate([wemb, embs_to_add]) bias_to_add = np.zeros((n_special_tokens, 1)) new_bias = np.concatenate((final_bias, bias_to_add), axis=1) return new_embs, new_bias def _cast_yaml_str(v): bool_dct = {"true": True, "false": False} if not isinstance(v, str): return v elif v in bool_dct: return bool_dct[v] try: return int(v) except (TypeError, ValueError): return v def cast_marian_config(raw_cfg: Dict[str, str]) -> Dict: return {k: _cast_yaml_str(v) for k, v in raw_cfg.items()} CONFIG_KEY = "special:model.yml" def load_config_from_state_dict(opus_dict): import yaml cfg_str = "".join([chr(x) for x in opus_dict[CONFIG_KEY]]) yaml_cfg = yaml.load(cfg_str[:-1], Loader=yaml.BaseLoader) return cast_marian_config(yaml_cfg) def find_model_file(dest_dir): # this one better model_files = list(Path(dest_dir).glob("*.npz")) assert len(model_files) == 1, model_files model_file = model_files[0] return model_file # Group Names Logic: change long opus model names to something shorter, like opus-mt-en-ROMANCE ROM_GROUP = "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la" GROUPS = [ ("cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "ZH"), (ROM_GROUP, "ROMANCE"), ("de+nl+fy+af+da+fo+is+no+nb+nn+sv", "NORTH_EU"), ("da+fo+is+no+nb+nn+sv", "SCANDINAVIA"), ("se+sma+smj+smn+sms", "SAMI"), ("nb_NO+nb+nn_NO+nn+nog+no_nb+no", "NORWAY"), ("ga+cy+br+gd+kw+gv", "CELTIC"), # https://en.wikipedia.org/wiki/Insular_Celtic_languages ] GROUP_TO_OPUS_NAME = { "opus-mt-ZH-de": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-de", "opus-mt-ZH-fi": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", "opus-mt-ZH-sv": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-sv", "opus-mt-SCANDINAVIA-SCANDINAVIA": "da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+sv", "opus-mt-NORTH_EU-NORTH_EU": "de+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+sv", "opus-mt-de-ZH": "de-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-en_el_es_fi-en_el_es_fi": "en+el+es+fi-en+el+es+fi", "opus-mt-en-ROMANCE": "en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la", "opus-mt-en-CELTIC": "en-ga+cy+br+gd+kw+gv", "opus-mt-es-NORWAY": "es-nb_NO+nb+nn_NO+nn+nog+no_nb+no", "opus-mt-fi_nb_no_nn_ru_sv_en-SAMI": "fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms", "opus-mt-fi-ZH": "fi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-fi-NORWAY": "fi-nb_NO+nb+nn_NO+nn+nog+no_nb+no", "opus-mt-ROMANCE-en": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en", "opus-mt-CELTIC-en": "ga+cy+br+gd+kw+gv-en", "opus-mt-sv-ZH": "sv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-sv-NORWAY": "sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no", } OPUS_GITHUB_URL = "https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/" ORG_NAME = "Helsinki-NLP/" def convert_opus_name_to_hf_name(x): for substr, grp_name in GROUPS: x = x.replace(substr, grp_name) return x.replace("+", "_") def convert_hf_name_to_opus_name(hf_model_name): """Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME.""" hf_model_name = remove_prefix(hf_model_name, ORG_NAME) if hf_model_name in GROUP_TO_OPUS_NAME: opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name] else: opus_w_prefix = hf_model_name.replace("_", "+") return remove_prefix(opus_w_prefix, "opus-mt-") def write_model_card( hf_model_name: str, repo_path="OPUS-MT-train/models/", dry_run=False, model_card_dir=Path("marian_converted/model_cards/Helsinki-NLP/"), ) -> str: """Copy the most recent model's readme section from opus, and add metadata. upload command: s3cmd sync --recursive model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ """ hf_model_name = remove_prefix(hf_model_name, ORG_NAME) opus_name: str = convert_hf_name_to_opus_name(hf_model_name) opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] readme_url = OPUS_GITHUB_URL + f"{opus_name}/README.md" s, t = ",".join(opus_src), ",".join(opus_tgt) extra_markdown = f"### {hf_model_name}\n\n* source languages: {s}\n* target languages: {t}\n* OPUS readme: [{opus_name}]({readme_url})\n" # combine with opus markdown opus_readme_path = Path(f"{repo_path}{opus_name}/README.md") assert opus_readme_path.exists(), opus_readme_path content = opus_readme_path.open().read() content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. content = "*".join(content.split("*")[1:]) content = extra_markdown + "\n* " + content.replace("download", "download original weights") if dry_run: return content # Save string to model_cards/hf_model_name/readme.md model_card_dir.mkdir(exist_ok=True) sub_dir = model_card_dir / hf_model_name sub_dir.mkdir(exist_ok=True) dest = sub_dir / "README.md" dest.open("w").write(content) return content def get_clean_model_id_mapping(multiling_model_ids): return {x: convert_opus_name_to_hf_name(x) for x in multiling_model_ids} def make_registry(repo_path="Opus-MT-train/models"): if not (Path(repo_path) / "fr-en" / "README.md").exists(): raise ValueError( f"repo_path:{repo_path} does not exist: " "You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling." ) results = {} for p in Path(repo_path).ls(): n_dash = p.name.count("-") if n_dash == 0: continue else: lns = list(open(p / "README.md").readlines()) results[p.name] = _parse_readme(lns) return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] def convert_all_sentencepiece_models(model_list=None, repo_path=None): """Requires 300GB""" save_dir = Path("marian_ckpt") dest_dir = Path("marian_converted") dest_dir.mkdir(exist_ok=True) if model_list is None: model_list: list = make_registry(repo_path=repo_path) for k, prepro, download, test_set_url in tqdm(model_list): if "SentencePiece" not in prepro: # dont convert BPE models. continue if not os.path.exists(save_dir / k / "pytorch_model.bin"): download_and_unzip(download, save_dir / k) pair_name = convert_opus_name_to_hf_name(k) convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}") def lmap(f, x) -> List: return list(map(f, x)) def fetch_test_set(test_set_url): import wget fname = wget.download(test_set_url, "opus_test.txt") lns = Path(fname).open().readlines() src = lmap(str.strip, lns[::4]) gold = lmap(str.strip, lns[1::4]) mar_model = lmap(str.strip, lns[2::4]) assert len(gold) == len(mar_model) == len(src) os.remove(fname) return src, mar_model, gold def convert_whole_dir(path=Path("marian_ckpt/")): for subdir in tqdm(list(path.ls())): dest_dir = f"marian_converted/{subdir.name}" if (dest_dir / "pytorch_model.bin").exists(): continue convert(source_dir, dest_dir) def _parse_readme(lns): """Get link and metadata from opus model card equivalent.""" subres = {} for ln in [x.strip() for x in lns]: if not ln.startswith("*"): continue ln = ln[1:].strip() for k in ["download", "dataset", "models", "model", "pre-processing"]: if ln.startswith(k): break else: continue if k in ["dataset", "model", "pre-processing"]: splat = ln.split(":") _, v = splat subres[k] = v elif k == "download": v = ln.split("(")[-1][:-1] subres[k] = v return subres def save_tokenizer_config(dest_dir: Path): dname = dest_dir.name.split("-") dct = dict(target_lang=dname[-1], source_lang="-".join(dname[:-1])) save_json(dct, dest_dir / "tokenizer_config.json") def add_to_vocab_(vocab: Dict[str, int], special_tokens: List[str]): start = max(vocab.values()) + 1 added = 0 for tok in special_tokens: if tok in vocab: continue vocab[tok] = start + added added += 1 return added def find_vocab_file(model_dir): return list(model_dir.glob("*vocab.yml"))[0] def add_special_tokens_to_vocab(model_dir: Path) -> None: vocab = load_yaml(find_vocab_file(model_dir)) vocab = {k: int(v) for k, v in vocab.items()} num_added = add_to_vocab_(vocab, [""]) print(f"added {num_added} tokens to vocab") save_json(vocab, model_dir / "vocab.json") save_tokenizer_config(model_dir) def save_tokenizer(self, save_directory): dest = Path(save_directory) src_path = Path(self.init_kwargs["source_spm"]) for dest_name in {"source.spm", "target.spm", "tokenizer_config.json"}: shutil.copyfile(src_path.parent / dest_name, dest / dest_name) save_json(self.encoder, dest / "vocab.json") def check_equal(marian_cfg, k1, k2): v1, v2 = marian_cfg[k1], marian_cfg[k2] assert v1 == v2, f"hparams {k1},{k2} differ: {v1} != {v2}" def check_marian_cfg_assumptions(marian_cfg): assumed_settings = { "tied-embeddings-all": True, "layer-normalization": False, "right-left": False, "transformer-ffn-depth": 2, "transformer-aan-depth": 2, "transformer-no-projection": False, "transformer-postprocess-emb": "d", "transformer-postprocess": "dan", # Dropout, add, normalize "transformer-preprocess": "", "type": "transformer", "ulr-dim-emb": 0, "dec-cell-base-depth": 2, "dec-cell-high-depth": 1, "transformer-aan-nogate": False, } for k, v in assumed_settings.items(): actual = marian_cfg[k] assert actual == v, f"Unexpected config value for {k} expected {v} got {actual}" check_equal(marian_cfg, "transformer-ffn-activation", "transformer-aan-activation") check_equal(marian_cfg, "transformer-ffn-depth", "transformer-aan-depth") check_equal(marian_cfg, "transformer-dim-ffn", "transformer-dim-aan") BIAS_KEY = "decoder_ff_logit_out_b" BART_CONVERTER = { # for each encoder and decoder layer "self_Wq": "self_attn.q_proj.weight", "self_Wk": "self_attn.k_proj.weight", "self_Wv": "self_attn.v_proj.weight", "self_Wo": "self_attn.out_proj.weight", "self_bq": "self_attn.q_proj.bias", "self_bk": "self_attn.k_proj.bias", "self_bv": "self_attn.v_proj.bias", "self_bo": "self_attn.out_proj.bias", "self_Wo_ln_scale": "self_attn_layer_norm.weight", "self_Wo_ln_bias": "self_attn_layer_norm.bias", "ffn_W1": "fc1.weight", "ffn_b1": "fc1.bias", "ffn_W2": "fc2.weight", "ffn_b2": "fc2.bias", "ffn_ffn_ln_scale": "final_layer_norm.weight", "ffn_ffn_ln_bias": "final_layer_norm.bias", # Decoder Cross Attention "context_Wk": "encoder_attn.k_proj.weight", "context_Wo": "encoder_attn.out_proj.weight", "context_Wq": "encoder_attn.q_proj.weight", "context_Wv": "encoder_attn.v_proj.weight", "context_bk": "encoder_attn.k_proj.bias", "context_bo": "encoder_attn.out_proj.bias", "context_bq": "encoder_attn.q_proj.bias", "context_bv": "encoder_attn.v_proj.bias", "context_Wo_ln_scale": "encoder_attn_layer_norm.weight", "context_Wo_ln_bias": "encoder_attn_layer_norm.bias", } class OpusState: def __init__(self, source_dir): npz_path = find_model_file(source_dir) self.state_dict = np.load(npz_path) cfg = load_config_from_state_dict(self.state_dict) assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1] assert "Wpos" not in self.state_dict self.state_dict = dict(self.state_dict) self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1) self.pad_token_id = self.wemb.shape[0] - 1 cfg["vocab_size"] = self.pad_token_id + 1 # self.state_dict['Wemb'].sha self.state_keys = list(self.state_dict.keys()) if "Wtype" in self.state_dict: raise ValueError("found Wtype key") self._check_layer_entries() self.source_dir = source_dir self.cfg = cfg hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape assert hidden_size == cfg["dim-emb"] == 512 # Process decoder.yml decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml")) check_marian_cfg_assumptions(cfg) self.hf_config = MarianConfig( vocab_size=cfg["vocab_size"], decoder_layers=cfg["dec-depth"], encoder_layers=cfg["enc-depth"], decoder_attention_heads=cfg["transformer-heads"], encoder_attention_heads=cfg["transformer-heads"], decoder_ffn_dim=cfg["transformer-dim-ffn"], encoder_ffn_dim=cfg["transformer-dim-ffn"], d_model=cfg["dim-emb"], activation_function=cfg["transformer-aan-activation"], pad_token_id=self.pad_token_id, eos_token_id=0, bos_token_id=0, max_position_embeddings=cfg["dim-emb"], scale_embedding=True, normalize_embedding="n" in cfg["transformer-preprocess"], static_position_embeddings=not cfg["transformer-train-position-embeddings"], dropout=0.1, # see opus-mt-train repo/transformer-dropout param. # default: add_final_layer_norm=False, num_beams=decoder_yml["beam-size"], decoder_start_token_id=self.pad_token_id, bad_words_ids=[[self.pad_token_id]], max_length=512, ) def _check_layer_entries(self): self.encoder_l1 = self.sub_keys("encoder_l1") self.decoder_l1 = self.sub_keys("decoder_l1") self.decoder_l2 = self.sub_keys("decoder_l2") if len(self.encoder_l1) != 16: warnings.warn(f"Expected 16 keys for each encoder layer, got {len(self.encoder_l1)}") if len(self.decoder_l1) != 26: warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") if len(self.decoder_l2) != 26: warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") @property def extra_keys(self): extra = [] for k in self.state_keys: if ( k.startswith("encoder_l") or k.startswith("decoder_l") or k in [CONFIG_KEY, "Wemb", "Wpos", "decoder_ff_logit_out_b"] ): continue else: extra.append(k) return extra def sub_keys(self, layer_prefix): return [remove_prefix(k, layer_prefix) for k in self.state_dict if k.startswith(layer_prefix)] def load_marian_model(self) -> MarianMTModel: state_dict, cfg = self.state_dict, self.hf_config assert cfg.static_position_embeddings model = MarianMTModel(cfg) assert "hidden_size" not in cfg.to_dict() load_layers_( model.model.encoder.layers, state_dict, BART_CONVERTER, ) load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) # handle tensors not associated with layers wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb)) bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias)) model.model.shared.weight = wemb_tensor model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared model.final_logits_bias = bias_tensor if "Wpos" in state_dict: print("Unexpected: got Wpos") wpos_tensor = torch.tensor(state_dict["Wpos"]) model.model.encoder.embed_positions.weight = wpos_tensor model.model.decoder.embed_positions.weight = wpos_tensor if cfg.normalize_embedding: assert "encoder_emb_ln_scale_pre" in state_dict raise NotImplementedError("Need to convert layernorm_embedding") assert not self.extra_keys, f"Failed to convert {self.extra_keys}" assert model.model.shared.padding_idx == self.pad_token_id return model def download_and_unzip(url, dest_dir): try: import wget except ImportError: raise ImportError("you must pip install wget") filename = wget.download(url) unzip(filename, dest_dir) os.remove(filename) def convert(source_dir: Path, dest_dir): dest_dir = Path(dest_dir) dest_dir.mkdir(exist_ok=True) add_special_tokens_to_vocab(source_dir) tokenizer = MarianTokenizer.from_pretrained(str(source_dir)) save_tokenizer(tokenizer, dest_dir) opus_state = OpusState(source_dir) assert opus_state.cfg["vocab_size"] == len(tokenizer.encoder) # save_json(opus_state.cfg, dest_dir / "marian_original_config.json") # ^^ Save human readable marian config for debugging model = opus_state.load_marian_model() model.save_pretrained(dest_dir) model.from_pretrained(dest_dir) # sanity check if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--src", type=str, help="path to marian model dir", default="en-de") parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") args = parser.parse_args() source_dir = Path(args.src) assert source_dir.exists() dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest convert(source_dir, dest_dir) def load_yaml(path): import yaml with open(path) as f: return yaml.load(f, Loader=yaml.BaseLoader) def save_json(content: Union[Dict, List], path: str) -> None: with open(path, "w") as f: json.dump(content, f) def unzip(zip_path: str, dest_dir: str) -> None: with ZipFile(zip_path, "r") as zipObj: zipObj.extractall(dest_dir) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_openai_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import logging import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt logging.basicConfig(level=logging.INFO) def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): # Construct model if openai_config_file == "": config = OpenAIGPTConfig() else: config = OpenAIGPTConfig.from_json_file(openai_config_file) model = OpenAIGPTModel(config) # Load weights from numpy load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--openai_checkpoint_folder_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--openai_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained OpenAI model. \n" "This specifies the model architecture.", ) args = parser.parse_args() convert_openai_checkpoint_to_pytorch( args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_pytorch_checkpoint_to_tf2.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Convert pytorch checkpoints to TensorFlow """ import argparse import logging import os from transformers import ( ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, WEIGHTS_NAME, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, FlaubertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, T5Config, TFAlbertForPreTraining, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, TFCamembertForMaskedLM, TFCTRLLMHeadModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFElectraForPreTraining, TFFlaubertWithLMHeadModel, TFGPT2LMHeadModel, TFOpenAIGPTLMHeadModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFT5ForConditionalGeneration, TFTransfoXLLMHeadModel, TFXLMRobertaForMaskedLM, TFXLMWithLMHeadModel, TFXLNetLMHeadModel, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, cached_path, hf_bucket_url, is_torch_available, load_pytorch_checkpoint_in_tf2_model, ) if is_torch_available(): import torch import numpy as np from transformers import ( BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, GPT2LMHeadModel, XLNetLMHeadModel, XLMWithLMHeadModel, XLMRobertaForMaskedLM, TransfoXLLMHeadModel, OpenAIGPTLMHeadModel, RobertaForMaskedLM, RobertaForSequenceClassification, CamembertForMaskedLM, FlaubertWithLMHeadModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, CTRLLMHeadModel, AlbertForPreTraining, T5ForConditionalGeneration, ElectraForPreTraining, ) logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { "bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,), "bert-large-uncased-whole-word-masking-finetuned-squad": ( BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "bert-large-cased-whole-word-masking-finetuned-squad": ( BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "bert-base-cased-finetuned-mrpc": ( BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlm-roberta": ( XLMRobertaConfig, TFXLMRobertaForMaskedLM, XLMRobertaForMaskedLM, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "transfo-xl": ( TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "openai-gpt": ( OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,), "roberta-large-mnli": ( RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "camembert": ( CamembertConfig, TFCamembertForMaskedLM, CamembertForMaskedLM, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "flaubert": ( FlaubertConfig, TFFlaubertWithLMHeadModel, FlaubertWithLMHeadModel, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "distilbert": ( DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "distilbert-base-distilled-squad": ( DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,), "albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,), "t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,), "electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,), } def convert_pt_checkpoint_to_tf( model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True ): if model_type not in MODEL_CLASSES: raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys()))) config_class, model_class, pt_model_class, aws_config_map = MODEL_CLASSES[model_type] # Initialise TF model if config_file in aws_config_map: config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models) config = config_class.from_json_file(config_file) config.output_hidden_states = True config.output_attentions = True print("Building TensorFlow model from configuration: {}".format(str(config))) tf_model = model_class(config) # Load weights from tf checkpoint if pytorch_checkpoint_path in aws_config_map.keys(): pytorch_checkpoint_url = hf_bucket_url(pytorch_checkpoint_path, filename=WEIGHTS_NAME) pytorch_checkpoint_path = cached_path(pytorch_checkpoint_url, force_download=not use_cached_models) # Load PyTorch checkpoint in tf2 model: tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") pt_model = pt_model_class.from_pretrained( pretrained_model_name_or_path=None, config=config, state_dict=state_dict ) with torch.no_grad(): pto = pt_model(**pt_model.dummy_inputs) np_pt = pto[0].numpy() np_tf = tfo[0].numpy() diff = np.amax(np.abs(np_pt - np_tf)) print("Max absolute difference between models outputs {}".format(diff)) assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff) # Save pytorch-model print("Save TensorFlow model to {}".format(tf_dump_path)) tf_model.save_weights(tf_dump_path, save_format="h5") def convert_all_pt_checkpoints_to_tf( args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False, ): assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" if args_model_type is None: model_types = list(MODEL_CLASSES.keys()) else: model_types = [args_model_type] for j, model_type in enumerate(model_types, start=1): print("=" * 100) print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type)) print("=" * 100) if model_type not in MODEL_CLASSES: raise ValueError( "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())) ) config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] if model_shortcut_names_or_path is None: model_shortcut_names_or_path = list(aws_model_maps.keys()) if config_shortcut_names_or_path is None: config_shortcut_names_or_path = model_shortcut_names_or_path for i, (model_shortcut_name, config_shortcut_name) in enumerate( zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1 ): print("-" * 100) if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name: if not only_convert_finetuned_models: print(" Skipping finetuned checkpoint {}".format(model_shortcut_name)) continue model_type = model_shortcut_name elif only_convert_finetuned_models: print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name)) continue print( " Converting checkpoint {}/{}: {} - model_type {}".format( i, len(aws_config_map), model_shortcut_name, model_type ) ) print("-" * 100) if config_shortcut_name in aws_config_map: config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models) else: config_file = cached_path(config_shortcut_name, force_download=not use_cached_models) if model_shortcut_name in aws_model_maps: model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models) else: model_file = cached_path(model_shortcut_name, force_download=not use_cached_models) if os.path.isfile(model_shortcut_name): model_shortcut_name = "converted_model" convert_pt_checkpoint_to_tf( model_type=model_type, pytorch_checkpoint_path=model_file, config_file=config_file, tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"), compare_with_pt_model=compare_with_pt_model, ) if remove_cached_files: os.remove(config_file) os.remove(model_file) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." ) parser.add_argument( "--model_type", default=None, type=str, help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format( list(MODEL_CLASSES.keys()) ), ) parser.add_argument( "--pytorch_checkpoint_path", default=None, type=str, help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. " "If not given, will download and convert all the checkpoints from AWS.", ) parser.add_argument( "--config_file", default=None, type=str, help="The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture. If not given and " "--pytorch_checkpoint_path is not given or is a shortcut name" "use the configuration associated to the shortcut name on the AWS", ) parser.add_argument( "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions." ) parser.add_argument( "--use_cached_models", action="store_true", help="Use cached models if possible instead of updating to latest checkpoint versions.", ) parser.add_argument( "--remove_cached_files", action="store_true", help="Remove pytorch models after conversion (save memory when converting in batches).", ) parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.") args = parser.parse_args() # if args.pytorch_checkpoint_path is not None: # convert_pt_checkpoint_to_tf(args.model_type.lower(), # args.pytorch_checkpoint_path, # args.config_file if args.config_file is not None else args.pytorch_checkpoint_path, # args.tf_dump_path, # compare_with_pt_model=args.compare_with_pt_model, # use_cached_models=args.use_cached_models) # else: convert_all_pt_checkpoints_to_tf( args.model_type.lower() if args.model_type is not None else None, args.tf_dump_path, model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None, config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, compare_with_pt_model=args.compare_with_pt_model, use_cached_models=args.use_cached_models, remove_cached_files=args.remove_cached_files, only_convert_finetuned_models=args.only_convert_finetuned_models, ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_reformer_trax_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Reformer checkpoint.""" import argparse import logging import pickle import numpy as np import torch from transformers import ReformerConfig, ReformerModelWithLMHead logging.basicConfig(level=logging.INFO) def set_param(torch_layer, weight, bias=None): # set parameter of one layer assert torch_layer.weight.shape == weight.shape, "{} layer.weight does not match".format(torch_layer) torch_layer.weight = torch.nn.Parameter(weight) if bias is not None: assert torch_layer.bias.shape == bias.shape, "{} layer.bias does not match".format(torch_layer) torch_layer.bias = torch.nn.Parameter(bias) def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size): # set torch weights for 1-to-1 comparison np_query_key = np.asarray(weights[0]) np_value = np.asarray(weights[1]) np_dense = np.asarray(weights[2]) set_param( torch_layer.self_attention.query_key, torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1), ) def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size): # set torch weights for 1-to-1 comparison np_query = np.asarray(weights[0]) np_key = np.asarray(weights[1]) np_value = np.asarray(weights[2]) np_dense = np.asarray(weights[3]) set_param( torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1), ) def set_block_weights_in_torch(weights, torch_block, hidden_size): # layernorm 1 layer_norm_1 = weights[0][0][0] layer_norm_1_weight = np.asarray(layer_norm_1[0]) layer_norm_1_bias = np.asarray(layer_norm_1[1]) set_param( torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias), ) # lsh weights + output attn_weights = weights[0][1] if len(attn_weights) < 4: set_layer_weights_in_torch_lsh(attn_weights, torch_block.attention, hidden_size) else: set_layer_weights_in_torch_local(attn_weights, torch_block.attention, hidden_size) # intermediate weighs intermediate_weights = weights[2][0][1][2] # Chunked Feed Forward if len(intermediate_weights) == 4: intermediate_weights = intermediate_weights[2] # layernorm 2 layer_norm_2_weight = np.asarray(intermediate_weights[0][0]) layer_norm_2_bias = np.asarray(intermediate_weights[0][1]) set_param( torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias), ) # intermediate dense inter_dense_weight = np.asarray(intermediate_weights[1][0]) inter_dense_bias = np.asarray(intermediate_weights[1][1]) set_param( torch_block.feed_forward.dense.dense, torch.tensor(inter_dense_weight).transpose(0, 1).contiguous(), torch.tensor(inter_dense_bias), ) # intermediate out out_dense_weight = np.asarray(intermediate_weights[4][0]) out_dense_bias = np.asarray(intermediate_weights[4][1]) set_param( torch_block.feed_forward.output.dense, torch.tensor(out_dense_weight).transpose(0, 1).contiguous(), torch.tensor(out_dense_bias), ) def set_model_weights_in_torch(weights, torch_model, hidden_size): # reformer model torch_model_reformer = torch_model.reformer # word embeds word_embeddings = np.asarray(weights[1]) set_param( torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings), ) if isinstance(weights[3], tuple): position_embeddings = torch_model_reformer.embeddings.position_embeddings for emb_idx in range(len(position_embeddings.weights)): emb_weights = np.asarray(weights[3][emb_idx][0]) assert position_embeddings.weights[emb_idx].shape == emb_weights.shape, "{} emb does not match".format( position_embeddings[emb_idx] ) position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights)) trax_layer_weights = weights[5] assert len(torch_model_reformer.encoder.layers) * 4 == len( trax_layer_weights ), "HF and trax model do not have the same number of layers" for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers): block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)] set_block_weights_in_torch(block_weights, layer, hidden_size) # output layer norm layer_norm_out_weight = np.asarray(weights[7][0]) layer_norm_out_bias = np.asarray(weights[7][1]) set_param( torch_model_reformer.encoder.layer_norm, torch.tensor(layer_norm_out_weight), torch.tensor(layer_norm_out_bias), ) # output embeddings output_embed_weights = np.asarray(weights[9][0]) output_embed_bias = np.asarray(weights[9][1]) set_param( torch_model.lm_head.decoder, torch.tensor(output_embed_weights).transpose(0, 1).contiguous(), torch.tensor(output_embed_bias), ) def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = ReformerConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = ReformerModelWithLMHead(config) with open(trax_model_pkl_path, "rb") as f: model_weights = pickle.load(f)["weights"] set_model_weights_in_torch(model_weights, model, config.hidden_size) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--trax_model_pkl_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained Reformer model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_trax_checkpoint_to_pytorch(args.trax_model_pkl_path, args.config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_roberta_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert RoBERTa checkpoint.""" import argparse import logging import pathlib import fairseq import torch from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version from transformers.modeling_bert import BertIntermediate, BertLayer, BertOutput, BertSelfAttention, BertSelfOutput from transformers.modeling_roberta import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification if version.parse(fairseq.__version__) < version.parse("0.9.0"): raise Exception("requires fairseq >= 0.9.0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) SAMPLE_TEXT = "Hello world! cécé herlolip" def convert_roberta_checkpoint_to_pytorch( roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool ): """ Copy/paste/tweak roberta's weights to our BERT structure. """ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout roberta_sent_encoder = roberta.model.decoder.sentence_encoder config = RobertaConfig( vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, layer_norm_eps=1e-5, # PyTorch default used in fairseq ) if classification_head: config.num_labels = roberta.args.num_classes print("Our BERT config:", config) model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) model.eval() # Now let's copy all the weights. # Embeddings model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( model.roberta.embeddings.token_type_embeddings.weight ) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias for i in range(config.num_hidden_layers): # Encoder: start of layer layer: BertLayer = model.roberta.encoder.layer[i] roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] # self attention self_attn: BertSelfAttention = layer.attention.self assert ( roberta_layer.self_attn.k_proj.weight.data.shape == roberta_layer.self_attn.q_proj.weight.data.shape == roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size((config.hidden_size, config.hidden_size)) ) self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias # self-attention output self_output: BertSelfOutput = layer.attention.output assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias # intermediate intermediate: BertIntermediate = layer.intermediate assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias # output bert_output: BertOutput = layer.output assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias # end of layer if classification_head: model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias else: # LM Head model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 our_output = model(input_ids)[0] if classification_head: their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 success = torch.allclose(our_output, their_output, atol=1e-3) print("Do both models output the same tensors?", "🔥" if success else "💩") if not success: raise Exception("Something went wRoNg") pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--classification_head", action="store_true", help="Whether to convert a final classification head." ) args = parser.parse_args() convert_roberta_checkpoint_to_pytorch( args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_t5_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The T5 authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert T5 checkpoint.""" import argparse import logging import torch from transformers import T5Config, T5Model, load_tf_weights_in_t5 logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = T5Config.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = T5Model(config) # Load weights from tf checkpoint load_tf_weights_in_t5(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained T5 model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Transformer XL checkpoint and datasets.""" import argparse import logging import os import pickle import sys import torch import transformers.tokenization_transfo_xl as data_utils from transformers import ( CONFIG_NAME, WEIGHTS_NAME, TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl, ) from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES logging.basicConfig(level=logging.INFO) # We do this to be able to load python 2 datasets pickles # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 data_utils.Vocab = data_utils.TransfoXLTokenizer data_utils.Corpus = data_utils.TransfoXLCorpus sys.modules["data_utils"] = data_utils sys.modules["vocabulary"] = data_utils def convert_transfo_xl_checkpoint_to_pytorch( tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file ): if transfo_xl_dataset_file: # Convert a pre-processed corpus (see original TensorFlow repo) with open(transfo_xl_dataset_file, "rb") as fp: corpus = pickle.load(fp, encoding="latin1") # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) corpus_vocab_dict = corpus.vocab.__dict__ torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) corpus_dict_no_vocab = corpus.__dict__ corpus_dict_no_vocab.pop("vocab", None) pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME print("Save dataset to {}".format(pytorch_dataset_dump_path)) torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) if tf_checkpoint_path: # Convert a pre-trained TensorFlow model config_path = os.path.abspath(transfo_xl_config_file) tf_path = os.path.abspath(tf_checkpoint_path) print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) # Initialise PyTorch model if transfo_xl_config_file == "": config = TransfoXLConfig() else: config = TransfoXLConfig.from_json_file(transfo_xl_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = TransfoXLLMHeadModel(config) model = load_tf_weights_in_transfo_xl(model, config, tf_path) # Save pytorch-model pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the folder to store the PyTorch model or dataset/vocab.", ) parser.add_argument( "--tf_checkpoint_path", default="", type=str, help="An optional path to a TensorFlow checkpoint path to be converted.", ) parser.add_argument( "--transfo_xl_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--transfo_xl_dataset_file", default="", type=str, help="An optional dataset file to be converted in a vocabulary.", ) args = parser.parse_args() convert_transfo_xl_checkpoint_to_pytorch( args.tf_checkpoint_path, args.transfo_xl_config_file, args.pytorch_dump_folder_path, args.transfo_xl_dataset_file, ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_xlm_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import json import logging import numpy import torch from transformers import CONFIG_NAME, WEIGHTS_NAME from transformers.tokenization_xlm import VOCAB_FILES_NAMES logging.basicConfig(level=logging.INFO) def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): # Load checkpoint chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") state_dict = chkpt["model"] # We have the base model one level deeper than the original XLM repository two_levels_state_dict = {} for k, v in state_dict.items(): if "pred_layer" in k: two_levels_state_dict[k] = v else: two_levels_state_dict["transformer." + k] = v config = chkpt["params"] config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) vocab = chkpt["dico_word2id"] vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(two_levels_state_dict, pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(json.dumps(config, indent=2) + "\n") print("Save vocab file to {}".format(pytorch_config_dump_path)) with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: f.write(json.dumps(vocab, indent=2) + "\n") if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/convert_xlnet_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BERT checkpoint.""" import argparse import logging import os import torch from transformers import ( CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, XLNetForQuestionAnswering, XLNetForSequenceClassification, XLNetLMHeadModel, load_tf_weights_in_xlnet, ) GLUE_TASKS_NUM_LABELS = { "cola": 2, "mnli": 3, "mrpc": 2, "sst-2": 2, "sts-b": 1, "qqp": 2, "qnli": 2, "rte": 2, "wnli": 2, } logging.basicConfig(level=logging.INFO) def convert_xlnet_checkpoint_to_pytorch( tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None ): # Initialise PyTorch model config = XLNetConfig.from_json_file(bert_config_file) finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" if finetuning_task in GLUE_TASKS_NUM_LABELS: print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) config.finetuning_task = finetuning_task config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] model = XLNetForSequenceClassification(config) elif "squad" in finetuning_task: config.finetuning_task = finetuning_task model = XLNetForQuestionAnswering(config) else: model = XLNetLMHeadModel(config) # Load weights from tf checkpoint load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) # Save pytorch-model pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--xlnet_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained XLNet model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the folder to store the PyTorch model or dataset/vocab.", ) parser.add_argument( "--finetuning_task", default=None, type=str, help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", ) args = parser.parse_args() print(args) convert_xlnet_checkpoint_to_pytorch( args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .metrics import is_sklearn_available from .processors import ( DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor, SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, ) if is_sklearn_available(): from .metrics import glue_compute_metrics, xnli_compute_metrics ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/data_collator.py ================================================ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any, Dict, List, NewType, Tuple import torch from torch.nn.utils.rnn import pad_sequence import random import numpy as np from ..tokenization_utils import PreTrainedTokenizer class DataCollator(ABC): """ A `DataCollator` is responsible for batching and pre-processing samples of data as requested by the training loop. """ @abstractmethod def collate_batch(self) -> Dict[str, torch.Tensor]: """ Take a list of samples from a Dataset and collate them into a batch. Returns: A dictionary of tensors """ pass InputDataClass = NewType("InputDataClass", Any) @dataclass class DefaultDataCollator(DataCollator): """ Very simple data collator that: - simply collates batches of dict-like objects - Performs special handling for potential keys named: - `label`: handles a single value (int or float) per object - `label_ids`: handles a list of values per object - does not do any additional preprocessing i.e., Property names of the input object will be used as corresponding inputs to the model. See glue and ner for example of how it's useful. """ def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]: # In this method we'll make the assumption that all `features` in the batch # have the same attributes. # So we will look at the first element as a proxy for what attributes exist # on the whole batch. first = features[0] # Special handling for labels. # Ensure that tensor is created with the correct type # (it should be automatically the case, but let's make sure of it.) if hasattr(first, "label") and first.label is not None: if type(first.label) is int: labels = torch.tensor([f.label for f in features], dtype=torch.long) else: labels = torch.tensor([f.label for f in features], dtype=torch.float) batch = {"labels": labels} elif hasattr(first, "label_ids") and first.label_ids is not None: if type(first.label_ids[0]) is int: labels = torch.tensor([f.label_ids for f in features], dtype=torch.long) else: labels = torch.tensor([f.label_ids for f in features], dtype=torch.float) batch = {"labels": labels} else: batch = {} # Handling of all other possible attributes. # Again, we will use the first element to figure out which key/values are not None for this model. for k, v in vars(first).items(): if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long) return batch @dataclass class DataCollatorForLanguageModeling(DataCollator): """ Data collator used for language modeling. - collates batches of tensors, honoring their tokenizer's pad_token - preprocesses batches for masked language modeling """ tokenizer: PreTrainedTokenizer mlm: bool = True mlm_probability: float = 0.15 def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]: batch = self._tensorize_batch(examples) if self.mlm: inputs, labels = self.mask_tokens7(batch) return {"input_ids": inputs, "labels": labels} else: return {"input_ids": batch, "labels": batch} def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: length_of_first = examples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) if are_tensors_same_length: return torch.stack(examples, dim=0) else: if self.tokenizer._pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({self.tokenizer.__class__.__name__}) does not have one." ) return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) # 10% of the time, we replace masked input tokens with random word indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) inputs[indices_random] = random_words[indices_random] # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels def mask_tokens2(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): if random.random() > 0.85: if random.random() > 0.2: inputs[i][j] = self.tokenizer.mask_token_id elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens3(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) probability_matrix = probability_matrix.numpy() labels = labels.numpy() covered = set() for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15) and (i,j) not in covered: if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.85: for k in range(j,min(j+5,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.7647: for k in range(j,min(j+4,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.5384: for k in range(j,min(j+3,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.42857: for k in range(j,min(j+2,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i,j)) elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens4(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) inputs = inputs.numpy() ids = [i for i in range(len(inputs))] random.shuffle(ids) inputs = inputs[ids] inputs = torch.from_numpy(inputs) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) total_token = 0 for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 cur_token = 0 probability_matrix = probability_matrix.numpy() labels = labels.numpy() covered = set() ngramFlag = True for i in range(len(probability_matrix)): if cur_token > total_token * 0.03: ngramFlag = False for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15) and (i,j) not in covered: if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.9 and ngramFlag: for k in range(j,min(j+4,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 elif random.random() > 0.222 and ngramFlag: for k in range(j,min(j+3,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 elif random.random() > 0.42857 and ngramFlag: for k in range(j,min(j+2,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i,j)) cur_token += 1 elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) cur_token += 1 else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens5(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() pvals = [0.4, 0.3, 0.2, 0.1] ngrams = np.arange(1, 5, dtype=np.int64) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 choose = random.randint(0, 1) if choose == 0: startIndex = 0 endIndex = np.argwhere(inputs[i] == np.float32(2))[-1][0] elif choose == 1: startIndex = np.argwhere(inputs[i] == np.float32(2))[-1][0] endIndex = np.argwhere(inputs[i] == np.float32(3))[-1][0] valid_j = [index for index in range(startIndex, endIndex + 1)] for j in range(len(probability_matrix[0])): if cur_token < total_token * 0.15: if probability_matrix[i][j] == np.float32(0.15): n = np.random.choice(ngrams, p=pvals) for k in range(n): if j + k >= len(probability_matrix[0]): break if (i, j+k) in covered: continue if j+k in valid_j: if random.random() > 0.7: if random.random() > 0.2: if probability_matrix[i][j+k] == np.float32(0.15): inputs[i][j+k] = self.tokenizer.mask_token_id covered.add((i, j + k)) cur_token += 1 elif random.random() > 0.5: if probability_matrix[i][j + k] == np.float32(0.15): inputs[i][j+k] = random.randint(5, len(self.tokenizer) - 1) covered.add((i, j + k)) cur_token += 1 else: if probability_matrix[i][j + k] == np.float32(0.15): covered.add((i, j + k)) cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens6(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 for j in range(len(probability_matrix[0])): if cur_token > total_token*0.15: break if probability_matrix[i][j] == np.float32(0.15): if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.9: for k in range(j, min(j + 4, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 elif random.random() > 0.222: for k in range(j, min(j + 3, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 elif random.random() > 0.42857: for k in range(j, min(j + 2, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i, j)) cur_token += 1 elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) cur_token += 1 else: cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens7(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() ngrams = np.arange(1, 3 + 1, dtype=np.int64) pvals = 1. / np.arange(1, 3 + 1) pvals /= pvals.sum(keepdims=True) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 for j in range(len(probability_matrix[0])): if cur_token <= total_token * 0.15: n = np.random.choice(ngrams, p=pvals) if probability_matrix[i][j] == np.float32(0.15): for k in range(n): if j + k >= len(probability_matrix[0]): break if (i, j+k) in covered: continue if random.random() > 0.85: if random.random() > 0.2: if probability_matrix[i][j+k] == np.float32(0.15): inputs[i][j+k] = self.tokenizer.mask_token_id covered.add((i, j + k)) cur_token += 1 elif random.random() > 0.5: if probability_matrix[i][j + k] == np.float32(0.15): inputs[i][j+k] = random.randint(5, len(self.tokenizer) - 1) covered.add((i, j + k)) cur_token += 1 else: if probability_matrix[i][j + k] == np.float32(0.15): covered.add((i, j + k)) cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/datasets/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .glue import GlueDataset, GlueDataTrainingArguments from .language_modeling import LineByLineTextDataset, TextDataset ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/datasets/glue.py ================================================ import logging import os import time from dataclasses import dataclass, field from enum import Enum from typing import List, Optional, Union import torch from filelock import FileLock from torch.utils.data.dataset import Dataset from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_xlm_roberta import XLMRobertaTokenizer from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors from ..processors.utils import InputFeatures logger = logging.getLogger(__name__) @dataclass class GlueDataTrainingArguments: """ Arguments pertaining to what data we are going to input our model for training and eval. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())}) data_dir: str = field( metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."} ) max_seq_length: int = field( default=128, metadata={ "help": "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded." }, ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) def __post_init__(self): self.task_name = self.task_name.lower() class Split(Enum): train = "train" dev = "dev" test = "test" class GlueDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ args: GlueDataTrainingArguments output_mode: str features: List[InputFeatures] def __init__( self, args: GlueDataTrainingArguments, tokenizer: PreTrainedTokenizer, limit_length: Optional[int] = None, mode: Union[str, Split] = Split.train, ): self.args = args self.processor = glue_processors[args.task_name]() self.output_mode = glue_output_modes[args.task_name] if isinstance(mode, str): try: mode = Split[mode] except KeyError: raise KeyError("mode is not a valid split name") # Load data features from cache or dataset file cached_features_file = os.path.join( args.data_dir, "cached_{}_{}_{}_{}".format( mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name, ), ) label_list = self.processor.get_labels() if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, ): # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] self.label_list = label_list # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): if os.path.exists(cached_features_file) and not args.overwrite_cache: start = time.time() self.features = torch.load(cached_features_file) logger.info( f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start ) else: logger.info(f"Creating features from dataset file at {args.data_dir}") if mode == Split.dev: examples = self.processor.get_dev_examples(args.data_dir) elif mode == Split.test: examples = self.processor.get_test_examples(args.data_dir) else: examples = self.processor.get_train_examples(args.data_dir) if limit_length is not None: examples = examples[:limit_length] self.features = glue_convert_examples_to_features( examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=self.output_mode, ) start = time.time() torch.save(self.features, cached_features_file) # ^ This seems to take a lot of time so I want to investigate why and how we can improve. logger.info( "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start ) def __len__(self): return len(self.features) def __getitem__(self, i) -> InputFeatures: return self.features[i] def get_labels(self): return self.label_list ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/datasets/language_modeling.py ================================================ import logging import os import pickle import time import torch from filelock import FileLock from torch.utils.data.dataset import Dataset from ...tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) class TextDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ def __init__( self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, ): assert os.path.isfile(file_path) block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) directory, filename = os.path.split(file_path) cached_features_file = os.path.join( directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): if os.path.exists(cached_features_file) and not overwrite_cache: start = time.time() with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) logger.info( f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start ) else: logger.info(f"Creating features from dataset file at {directory}") self.examples = [] with open(file_path, encoding="utf-8") as f: text = f.read() tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size self.examples.append( tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]) ) # Note that we are losing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. start = time.time() with open(cached_features_file, "wb") as handle: pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) logger.info( "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start ) def __len__(self): return len(self.examples) def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) class LineByLineTextDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int): assert os.path.isfile(file_path) # Here, we do not cache the features, operating under the assumption # that we will soon use fast multithreaded tokenizers from the # `tokenizers` repo everywhere =) logger.info("Creating features from dataset file at %s", file_path) with open(file_path, encoding="utf-8") as f: lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) self.examples = batch_encoding["input_ids"] def __len__(self): return len(self.examples) def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/metrics/__init__.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. try: from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score _has_sklearn = True except (AttributeError, ImportError): _has_sklearn = False def is_sklearn_available(): return _has_sklearn if _has_sklearn: def simple_accuracy(preds, labels): return (preds == labels).mean() def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { "acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2, } def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { "pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2, } def glue_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "cola": return {"mcc": matthews_corrcoef(labels, preds)} elif task_name == "sst-2": return {"acc": simple_accuracy(preds, labels)} elif task_name == "mrpc": return acc_and_f1(preds, labels) elif task_name == "sts-b": return pearson_and_spearman(preds, labels) elif task_name == "qqp": return acc_and_f1(preds, labels) elif task_name == "mnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "mnli-mm": return {"acc": simple_accuracy(preds, labels)} elif task_name == "qnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "rte": return {"acc": simple_accuracy(preds, labels)} elif task_name == "wnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "hans": return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) def xnli_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "xnli": return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/metrics/squad_metrics.py ================================================ """ Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted probability that a question is unanswerable. """ import collections import json import logging import math import re import string from transformers.tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" def remove_articles(text): regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) return re.sub(regex, " ", text) def white_space_fix(text): return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() return white_space_fix(remove_articles(remove_punc(lower(s)))) def get_tokens(s): if not s: return [] return normalize_answer(s).split() def compute_exact(a_gold, a_pred): return int(normalize_answer(a_gold) == normalize_answer(a_pred)) def compute_f1(a_gold, a_pred): gold_toks = get_tokens(a_gold) pred_toks = get_tokens(a_pred) common = collections.Counter(gold_toks) & collections.Counter(pred_toks) num_same = sum(common.values()) if len(gold_toks) == 0 or len(pred_toks) == 0: # If either is no-answer, then F1 is 1 if they agree, 0 otherwise return int(gold_toks == pred_toks) if num_same == 0: return 0 precision = 1.0 * num_same / len(pred_toks) recall = 1.0 * num_same / len(gold_toks) f1 = (2 * precision * recall) / (precision + recall) return f1 def get_raw_scores(examples, preds): """ Computes the exact and f1 scores from the examples and the model predictions """ exact_scores = {} f1_scores = {} for example in examples: qas_id = example.qas_id gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] if not gold_answers: # For unanswerable questions, only correct answer is empty string gold_answers = [""] if qas_id not in preds: print("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) return exact_scores, f1_scores def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): new_scores = {} for qid, s in scores.items(): pred_na = na_probs[qid] > na_prob_thresh if pred_na: new_scores[qid] = float(not qid_to_has_ans[qid]) else: new_scores[qid] = s return new_scores def make_eval_dict(exact_scores, f1_scores, qid_list=None): if not qid_list: total = len(exact_scores) return collections.OrderedDict( [ ("exact", 100.0 * sum(exact_scores.values()) / total), ("f1", 100.0 * sum(f1_scores.values()) / total), ("total", total), ] ) else: total = len(qid_list) return collections.OrderedDict( [ ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), ("total", total), ] ) def merge_eval(main_eval, new_eval, prefix): for k in new_eval: main_eval["%s_%s" % (prefix, k)] = new_eval[k] def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) for i, qid in enumerate(qid_list): if qid not in scores: continue if qid_to_has_ans[qid]: diff = scores[qid] else: if preds[qid]: diff = -1 else: diff = 0 cur_score += diff if cur_score > best_score: best_score = cur_score best_thresh = na_probs[qid] has_ans_score, has_ans_cnt = 0, 0 for qid in qid_list: if not qid_to_has_ans[qid]: continue has_ans_cnt += 1 if qid not in scores: continue has_ans_score += scores[qid] return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) main_eval["best_exact"] = best_exact main_eval["best_exact_thresh"] = exact_thresh main_eval["best_f1"] = best_f1 main_eval["best_f1_thresh"] = f1_thresh main_eval["has_ans_exact"] = has_ans_exact main_eval["has_ans_f1"] = has_ans_f1 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) for _, qid in enumerate(qid_list): if qid not in scores: continue if qid_to_has_ans[qid]: diff = scores[qid] else: if preds[qid]: diff = -1 else: diff = 0 cur_score += diff if cur_score > best_score: best_score = cur_score best_thresh = na_probs[qid] return 100.0 * best_score / len(scores), best_thresh def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) main_eval["best_exact"] = best_exact main_eval["best_exact_thresh"] = exact_thresh main_eval["best_f1"] = best_f1 main_eval["best_f1_thresh"] = f1_thresh def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] if no_answer_probs is None: no_answer_probs = {k: 0.0 for k in preds} exact, f1 = get_raw_scores(examples, preds) exact_threshold = apply_no_ans_threshold( exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) evaluation = make_eval_dict(exact_threshold, f1_threshold) if has_answer_qids: has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) merge_eval(evaluation, has_ans_eval, "HasAns") if no_answer_qids: no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) return evaluation def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So # now `orig_text` contains the span of our original text corresponding to the # span that we predicted. # # However, `orig_text` may contain extra characters that we don't want in # our prediction. # # For example, let's say: # pred_text = steve smith # orig_text = Steve Smith's # # We don't want to return `orig_text` because it contains the extra "'s". # # We don't want to return `pred_text` because it's already been normalized # (the SQuAD eval script also does punctuation stripping/lower casing but # our tokenizer does additional normalization like stripping accent # characters). # # What we really want to return is "Steve Smith". # # Therefore, we have to apply a semi-complicated alignment heuristic between # `pred_text` and `orig_text` to get a character-to-character alignment. This # can fail in certain cases in which case we just return `orig_text`. def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() for (i, c) in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i ns_chars.append(c) ns_text = "".join(ns_chars) return (ns_text, ns_to_s_map) # We first tokenize `orig_text`, strip whitespace from the result # and `pred_text`, and check if they are the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. tokenizer = BasicTokenizer(do_lower_case=do_lower_case) tok_text = " ".join(tokenizer.tokenize(orig_text)) start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} for (i, tok_index) in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None if start_position in tok_s_to_ns_map: ns_start_position = tok_s_to_ns_map[start_position] if ns_start_position in orig_ns_to_s_map: orig_start_position = orig_ns_to_s_map[ns_start_position] if orig_start_position is None: if verbose_logging: logger.info("Couldn't map start position") return orig_text orig_end_position = None if end_position in tok_s_to_ns_map: ns_end_position = tok_s_to_ns_map[end_position] if ns_end_position in orig_ns_to_s_map: orig_end_position = orig_ns_to_s_map[ns_end_position] if orig_end_position is None: if verbose_logging: logger.info("Couldn't map end position") return orig_text output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): if i >= n_best_size: break best_indexes.append(index_and_score[i][0]) return best_indexes def _compute_softmax(scores): """Compute softmax probability over raw logits.""" if not scores: return [] max_score = None for score in scores: if max_score is None or score > max_score: max_score = score exp_scores = [] total_sum = 0.0 for score in scores: x = math.exp(score - max_score) exp_scores.append(x) total_sum += x probs = [] for score in exp_scores: probs.append(score / total_sum) return probs def compute_predictions_logits( all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, version_2_with_negative, null_score_diff_threshold, tokenizer, ): """Write final predictions to the json file and log-odds of null if needed.""" if output_prediction_file: logger.info(f"Writing predictions to: {output_prediction_file}") if output_nbest_file: logger.info(f"Writing nbest to: {output_nbest_file}") if output_null_log_odds_file and version_2_with_negative: logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive min_null_feature_index = 0 # the paragraph slice with min null score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: feature_null_score = result.start_logits[0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index null_start_logit = result.start_logits[0] null_end_logit = result.end_logits[0] for start_index in start_indexes: for end_index in end_indexes: # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. if start_index >= len(feature.tokens): continue if end_index >= len(feature.tokens): continue if start_index not in feature.token_to_orig_map: continue if end_index not in feature.token_to_orig_map: continue if not feature.token_is_max_context.get(start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], end_logit=result.end_logits[end_index], ) ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( feature_index=min_null_feature_index, start_index=0, end_index=0, start_logit=null_start_logit, end_logit=null_end_logit, ) ) prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_logit", "end_logit"] ) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # tok_text = " ".join(tok_tokens) # # # De-tokenize WordPieces that have been split off. # tok_text = tok_text.replace(" ##", "") # tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True else: final_text = "" seen_predictions[final_text] = True nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_logit + entry.end_logit) if not best_non_null_entry: if entry.text: best_non_null_entry = entry probs = _compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_logit"] = entry.start_logit output["end_logit"] = entry.end_logit nbest_json.append(output) assert len(nbest_json) >= 1 if not version_2_with_negative: all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" else: all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json if output_prediction_file: with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") if output_nbest_file: with open(output_nbest_file, "w") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if output_null_log_odds_file and version_2_with_negative: with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") return all_predictions def compute_predictions_log_probs( all_examples, all_features, all_results, n_best_size, max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, version_2_with_negative, tokenizer, verbose_logging, ): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] cur_null_score = result.cls_logits # if we could have irrelevant answers, get the min score of irrelevant score_null = min(score_null, cur_null_score) for i in range(start_n_top): for j in range(end_n_top): start_log_prob = result.start_logits[i] start_index = result.start_top_index[i] j_index = i * end_n_top + j end_log_prob = result.end_logits[j_index] end_index = result.end_top_index[j_index] # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. if start_index >= feature.paragraph_len - 1: continue if end_index >= feature.paragraph_len - 1: continue if not feature.token_is_max_context.get(start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, end_log_prob=end_log_prob, ) ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True ) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] # end_orig_pos = tok_end_to_orig_index[pred.end_index] # paragraph_text = example.paragraph_text # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) if hasattr(tokenizer, "do_lower_case"): do_lower_case = tokenizer.do_lower_case else: do_lower_case = tokenizer.do_lowercase_and_remove_accent final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True nbest.append( _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_log_prob + entry.end_log_prob) if not best_non_null_entry: best_non_null_entry = entry probs = _compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_log_prob"] = entry.start_log_prob output["end_log_prob"] = entry.end_log_prob nbest_json.append(output) assert len(nbest_json) >= 1 assert best_non_null_entry is not None score_diff = score_null scores_diff_json[example.qas_id] = score_diff # note(zhiliny): always predict best_non_null_entry # and the evaluation script will search for the best threshold all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") with open(output_nbest_file, "w") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if version_2_with_negative: with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") return all_predictions ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/processors/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/processors/glue.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ GLUE processors and helpers """ import logging import os from enum import Enum from typing import List, Optional, Union from ...file_utils import is_tf_available from ...tokenization_utils import PreTrainedTokenizer from .utils import DataProcessor, InputExample, InputFeatures if is_tf_available(): import tensorflow as tf logger = logging.getLogger(__name__) def glue_convert_examples_to_features( examples: Union[List[InputExample], "tf.data.Dataset"], tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, ): """ Loads a data file into a list of ``InputFeatures`` Args: examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length. Defaults to the tokenizer's max_len task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ if is_tf_available() and isinstance(examples, tf.data.Dataset): if task is None: raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.") return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) return _glue_convert_examples_to_features( examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode ) if is_tf_available(): def _tf_glue_convert_examples_to_features( examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None, ) -> tf.data.Dataset: """ Returns: A ``tf.data.Dataset`` containing the task-specific features. """ processor = glue_processors[task]() examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples] features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) def gen(): for ex in features: yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, }, ex.label, ) return tf.data.Dataset.from_generator( gen, ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), }, tf.TensorShape([]), ), ) def _glue_convert_examples_to_features( examples: List[InputExample], tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, ): if max_length is None: max_length = tokenizer.max_len if task is not None: processor = glue_processors[task]() if label_list is None: label_list = processor.get_labels() logger.info("Using label list %s for task %s" % (label_list, task)) if output_mode is None: output_mode = glue_output_modes[task] logger.info("Using output mode %s for task %s" % (output_mode, task)) label_map = {label: i for i, label in enumerate(label_list)} def label_from_example(example: InputExample) -> Union[int, float, None]: if example.label is None: return None if output_mode == "classification": return label_map[example.label] elif output_mode == "regression": return float(example.label) raise KeyError(output_mode) labels = [label_from_example(example) for example in examples] batch_encoding = tokenizer.batch_encode_plus( [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, ) features = [] for i in range(len(examples)): inputs = {k: batch_encoding[k][i] for k in batch_encoding} feature = InputFeatures(**inputs, label=labels[i]) features.append(feature) for i, example in enumerate(examples[:5]): logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("features: %s" % features[i]) return features class OutputMode(Enum): classification = "classification" regression = "regression" class MrpcProcessor(DataProcessor): """Processor for the MRPC data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[3] text_b = line[4] label = None if set_type == "test" else line[0] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class MnliProcessor(DataProcessor): """Processor for the MultiNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["premise"].numpy().decode("utf-8"), tensor_dict["hypothesis"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched") def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[8] text_b = line[9] label = None if set_type.startswith("test") else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class MnliMismatchedProcessor(MnliProcessor): """Processor for the MultiNLI Mismatched data set (GLUE version).""" def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched") class ColaProcessor(DataProcessor): """Processor for the CoLA data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence"].numpy().decode("utf-8"), None, str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" test_mode = set_type == "test" if test_mode: lines = lines[1:] text_index = 1 if test_mode else 3 examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) text_a = line[text_index] label = None if test_mode else line[1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples class Sst2Processor(DataProcessor): """Processor for the SST-2 data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence"].numpy().decode("utf-8"), None, str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] text_index = 1 if set_type == "test" else 0 for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[text_index] label = None if set_type == "test" else line[1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples class StsbProcessor(DataProcessor): """Processor for the STS-B data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return [None] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[7] text_b = line[8] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class QqpProcessor(DataProcessor): """Processor for the QQP data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["question1"].numpy().decode("utf-8"), tensor_dict["question2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" test_mode = set_type == "test" q1_index = 1 if test_mode else 3 q2_index = 2 if test_mode else 4 examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) try: text_a = line[q1_index] text_b = line[q2_index] label = None if test_mode else line[5] except IndexError: continue examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class QnliProcessor(DataProcessor): """Processor for the QNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["question"].numpy().decode("utf-8"), tensor_dict["sentence"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["entailment", "not_entailment"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class RteProcessor(DataProcessor): """Processor for the RTE data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["entailment", "not_entailment"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class WnliProcessor(DataProcessor): """Processor for the WNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples glue_tasks_num_labels = { "cola": 2, "mnli": 3, "mrpc": 2, "sst-2": 2, "sts-b": 1, "qqp": 2, "qnli": 2, "rte": 2, "wnli": 2, } glue_processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } glue_output_modes = { "cola": "classification", "mnli": "classification", "mnli-mm": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/processors/squad.py ================================================ import json import logging import os from functools import partial from multiprocessing import Pool, cpu_count import numpy as np from tqdm import tqdm from ...file_utils import is_tf_available, is_torch_available from ...tokenization_bert import whitespace_tokenize from .utils import DataProcessor if is_torch_available(): import torch from torch.utils.data import TensorDataset if is_tf_available(): import tensorflow as tf logger = logging.getLogger(__name__) def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) return (input_start, input_end) def _check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue if position > end: continue num_left_context = position - doc_span.start num_right_context = end - position score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index return cur_span_index == best_span_index def _new_check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # if len(doc_spans) == 1: # return True best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): end = doc_span["start"] + doc_span["length"] - 1 if position < doc_span["start"]: continue if position > end: continue num_left_context = position - doc_span["start"] num_right_context = end - position score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] if best_score is None or score > best_score: best_score = score best_span_index = span_index return cur_span_index == best_span_index def _is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) ) non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) index = ( j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) p_mask = np.ones_like(span["token_type_ids"]) if tokenizer.padding_side == "right": p_mask[len(truncated_query) + sequence_added_tokens :] = 0 else: p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) special_token_indices = np.asarray( tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) ).nonzero() p_mask[pad_token_indices] = 1 p_mask[special_token_indices] = 1 # Set the cls index to 0: the CLS index can be used for impossible answers p_mask[cls_index] = 0 span_is_impossible = example.is_impossible start_position = 0 end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = cls_index end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset features.append( SquadFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], start_position=start_position, end_position=end_position, is_impossible=span_is_impossible, qas_id=example.qas_id, ) ) return features def squad_convert_example_to_features_init(tokenizer_for_convert): global tokenizer tokenizer = tokenizer_for_convert def squad_convert_examples_to_features( examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1, tqdm_enabled=True, ): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. Args: examples: list of :class:`~transformers1.data.processors.squad.SquadExample` tokenizer: an instance of a child of :class:`~transformers1.PreTrainedTokenizer` max_seq_length: The maximum sequence length of the inputs. doc_stride: The stride used when the context is too large and is split across several features. max_query_length: The maximum length of the query. is_training: whether to create features for model evaluation or model training. return_dataset: Default False. Either 'pt' or 'tf'. if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset threads: multiple processing threadsa-smi Returns: list of :class:`~transformers1.data.processors.squad.SquadFeatures` Example:: processor = SquadV2Processor() examples = processor.get_dev_examples(data_dir) features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, ) """ # Defining helper methods features = [] threads = min(threads, cpu_count()) with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: annotate_ = partial( squad_convert_example_to_features, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training, ) features = list( tqdm( p.imap(annotate_, examples, chunksize=32), total=len(examples), desc="convert squad examples to features", disable=not tqdm_enabled, ) ) new_features = [] unique_id = 1000000000 example_index = 0 for example_features in tqdm( features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled ): if not example_features: continue for example_feature in example_features: example_feature.example_index = example_index example_feature.unique_id = unique_id new_features.append(example_feature) unique_id += 1 example_index += 1 features = new_features del new_features if return_dataset == "pt": if not is_torch_available(): raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) if not is_training: all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_start_positions, all_end_positions, all_cls_index, all_p_mask, all_is_impossible, ) return features, dataset elif return_dataset == "tf": if not is_tf_available(): raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") def gen(): for i, ex in enumerate(features): yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, "feature_index": i, "qas_id": ex.qas_id, }, { "start_position": ex.start_position, "end_position": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, "is_impossible": ex.is_impossible, }, ) # Why have we split the batch into a tuple? PyTorch just has a list of tensors. train_types = ( { "input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32, "feature_index": tf.int64, "qas_id": tf.string, }, { "start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32, "is_impossible": tf.int32, }, ) train_shapes = ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), "feature_index": tf.TensorShape([]), "qas_id": tf.TensorShape([]), }, { "start_position": tf.TensorShape([]), "end_position": tf.TensorShape([]), "cls_index": tf.TensorShape([]), "p_mask": tf.TensorShape([None]), "is_impossible": tf.TensorShape([]), }, ) return tf.data.Dataset.from_generator(gen, train_types, train_shapes) else: return features class SquadProcessor(DataProcessor): """ Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. """ train_file = None dev_file = None def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") answer_start = tensor_dict["answers"]["answer_start"][0].numpy() answers = [] else: answers = [ {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) ] answer = None answer_start = None return SquadExample( qas_id=tensor_dict["id"].numpy().decode("utf-8"), question_text=tensor_dict["question"].numpy().decode("utf-8"), context_text=tensor_dict["context"].numpy().decode("utf-8"), answer_text=answer, start_position_character=answer_start, title=tensor_dict["title"].numpy().decode("utf-8"), answers=answers, ) def get_examples_from_dataset(self, dataset, evaluate=False): """ Creates a list of :class:`~transformers1.data.processors.squad.SquadExample` using a TFDS dataset. Args: dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` evaluate: boolean specifying if in evaluation mode or in training mode Returns: List of SquadExample Examples:: import tensorflow_datasets as tfds dataset = tfds.load("squad") training_examples = get_examples_from_dataset(dataset, evaluate=False) evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) """ if evaluate: dataset = dataset["validation"] else: dataset = dataset["train"] examples = [] for tensor_dict in tqdm(dataset): examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples def get_train_examples(self, data_dir, filename=None): """ Returns the training examples from the data directory. Args: data_dir: Directory containing the data files used for training and evaluating. filename: None by default, specify this if the training file has a different name than the original one which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ if data_dir is None: data_dir = "" if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open( os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") def get_dev_examples(self, data_dir, filename=None): """ Returns the evaluation example from the data directory. Args: data_dir: Directory containing the data files used for training and evaluating. filename: None by default, specify this if the evaluation file has a different name than the original one which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ if data_dir is None: data_dir = "" if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open( os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") def _create_examples(self, input_data, set_type): is_training = set_type == "train" examples = [] for entry in tqdm(input_data): title = entry["title"] for paragraph in entry["paragraphs"]: context_text = paragraph["context"] for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position_character = None answer_text = None answers = [] if "is_impossible" in qa: is_impossible = qa["is_impossible"] else: is_impossible = False if not is_impossible: if is_training: answer = qa["answers"][0] answer_text = answer["text"] start_position_character = answer["answer_start"] else: answers = qa["answers"] example = SquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, answer_text=answer_text, start_position_character=start_position_character, title=title, is_impossible=is_impossible, answers=answers, ) examples.append(example) return examples class SquadV1Processor(SquadProcessor): train_file = "train-v1.1.json" dev_file = "dev-v1.1.json" class SquadV2Processor(SquadProcessor): train_file = "train-v2.0.json" dev_file = "dev-v2.0.json" class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. Args: qas_id: The example's unique identifier question_text: The question string context_text: The context string answer_text: The answer string start_position_character: The character position of the start of the answer title: The title of the example answers: None by default, this is used during evaluation. Holds answers as well as their start positions. is_impossible: False by default, set to True if the example has no possible answer. """ def __init__( self, qas_id, question_text, context_text, answer_text, start_position_character, title, answers=[], is_impossible=False, ): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text self.title = title self.is_impossible = is_impossible self.answers = answers self.start_position, self.end_position = 0, 0 doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True # Split on whitespace so that different tokens may be attributed to their original position. for c in self.context_text: if _is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) self.doc_tokens = doc_tokens self.char_to_word_offset = char_to_word_offset # Start and end positions only has a value during evaluation. if start_position_character is not None and not is_impossible: self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[ min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] class SquadFeatures(object): """ Single squad example features to be fed to a model. Those features are model-specific and can be crafted from :class:`~transformers1.data.processors.squad.SquadExample` using the :method:`~transformers1.data.processors.squad.squad_convert_examples_to_features` method. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. token_type_ids: Segment token indices to indicate first and second portions of the inputs. cls_index: the index of the CLS token. p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer example_index: the index of the example unique_id: The unique Feature identifier paragraph_len: The length of the context token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. If a token does not have their maximum context in this feature object, it means that another feature object has more information related to that token and should be prioritized over this feature for that token. tokens: list of tokens corresponding to the input ids token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. start_position: start of the answer token index end_position: end of the answer token index """ def __init__( self, input_ids, attention_mask, token_type_ids, cls_index, p_mask, example_index, unique_id, paragraph_len, token_is_max_context, tokens, token_to_orig_map, start_position, end_position, is_impossible, qas_id: str = None, ): self.input_ids = input_ids self.attention_mask = attention_mask self.token_type_ids = token_type_ids self.cls_index = cls_index self.p_mask = p_mask self.example_index = example_index self.unique_id = unique_id self.paragraph_len = paragraph_len self.token_is_max_context = token_is_max_context self.tokens = tokens self.token_to_orig_map = token_to_orig_map self.start_position = start_position self.end_position = end_position self.is_impossible = is_impossible self.qas_id = qas_id class SquadResult(object): """ Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. Args: unique_id: The unique identifier corresponding to that example. start_logits: The logits corresponding to the start of the answer end_logits: The logits corresponding to the end of the answer """ def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): self.start_logits = start_logits self.end_logits = end_logits self.unique_id = unique_id if start_top_index: self.start_top_index = start_top_index self.end_top_index = end_top_index self.cls_logits = cls_logits ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/processors/utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import csv import dataclasses import json import logging from dataclasses import dataclass from typing import List, Optional, Union from ...file_utils import is_tf_available, is_torch_available logger = logging.getLogger(__name__) @dataclass class InputExample: """ A single training/test example for simple sequence classification. Args: guid: Unique id for the example. text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified. text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks. label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. """ guid: str text_a: str text_b: Optional[str] = None label: Optional[str] = None def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(dataclasses.asdict(self), indent=2) + "\n" @dataclass(frozen=True) class InputFeatures: """ A single set of features of data. Property names are the same names as the corresponding inputs to a model. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. token_type_ids: (Optional) Segment token indices to indicate first and second portions of the inputs. Only some models use them. label: (Optional) Label corresponding to the input. Int for classification problems, float for regression problems. """ input_ids: List[int] attention_mask: Optional[List[int]] = None token_type_ids: Optional[List[int]] = None label: Optional[Union[int, float]] = None def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(dataclasses.asdict(self)) + "\n" class DataProcessor: """Base class for data converters for sequence classification data sets.""" def get_example_from_tensor_dict(self, tensor_dict): """Gets an example from a dict with tensorflow tensors Args: tensor_dict: Keys and values should match the corresponding Glue tensorflow_dataset examples. """ raise NotImplementedError() def get_train_examples(self, data_dir): """Gets a collection of `InputExample`s for the train set.""" raise NotImplementedError() def get_dev_examples(self, data_dir): """Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() def get_test_examples(self, data_dir): """Gets a collection of `InputExample`s for the test set.""" raise NotImplementedError() def get_labels(self): """Gets the list of labels for this data set.""" raise NotImplementedError() def tfds_map(self, example): """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts examples to the correct format.""" if len(self.get_labels()) > 1: example.label = self.get_labels()[int(example.label)] return example @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" with open(input_file, "r", encoding="utf-8-sig") as f: return list(csv.reader(f, delimiter="\t", quotechar=quotechar)) class SingleSentenceClassificationProcessor(DataProcessor): """ Generic processor for a single sentence classification data set.""" def __init__(self, labels=None, examples=None, mode="classification", verbose=False): self.labels = [] if labels is None else labels self.examples = [] if examples is None else examples self.mode = mode self.verbose = verbose def __len__(self): return len(self.examples) def __getitem__(self, idx): if isinstance(idx, slice): return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx]) return self.examples[idx] @classmethod def create_from_csv( cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs ): processor = cls(**kwargs) processor.add_examples_from_csv( file_name, split_name=split_name, column_label=column_label, column_text=column_text, column_id=column_id, skip_first_row=skip_first_row, overwrite_labels=True, overwrite_examples=True, ) return processor @classmethod def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs): processor = cls(**kwargs) processor.add_examples(texts_or_text_and_labels, labels=labels) return processor def add_examples_from_csv( self, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, overwrite_labels=False, overwrite_examples=False, ): lines = self._read_tsv(file_name) if skip_first_row: lines = lines[1:] texts = [] labels = [] ids = [] for (i, line) in enumerate(lines): texts.append(line[column_text]) labels.append(line[column_label]) if column_id is not None: ids.append(line[column_id]) else: guid = "%s-%s" % (split_name, i) if split_name else "%s" % i ids.append(guid) return self.add_examples( texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples ) def add_examples( self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False ): assert labels is None or len(texts_or_text_and_labels) == len(labels) assert ids is None or len(texts_or_text_and_labels) == len(ids) if ids is None: ids = [None] * len(texts_or_text_and_labels) if labels is None: labels = [None] * len(texts_or_text_and_labels) examples = [] added_labels = set() for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids): if isinstance(text_or_text_and_label, (tuple, list)) and label is None: text, label = text_or_text_and_label else: text = text_or_text_and_label added_labels.add(label) examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label)) # Update examples if overwrite_examples: self.examples = examples else: self.examples.extend(examples) # Update labels if overwrite_labels: self.labels = list(added_labels) else: self.labels = list(set(self.labels).union(added_labels)) return self.examples def get_features( self, tokenizer, max_length=None, pad_on_left=False, pad_token=0, mask_padding_with_zero=True, return_tensors=None, ): """ Convert examples in a list of ``InputFeatures`` Args: tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ if max_length is None: max_length = tokenizer.max_len label_map = {label: i for i, label in enumerate(self.labels)} all_input_ids = [] for (ex_index, example) in enumerate(self.examples): if ex_index % 10000 == 0: logger.info("Tokenizing example %d", ex_index) input_ids = tokenizer.encode( example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len), ) all_input_ids.append(input_ids) batch_length = max(len(input_ids) for input_ids in all_input_ids) features = [] for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)): if ex_index % 10000 == 0: logger.info("Writing example %d/%d" % (ex_index, len(self.examples))) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. padding_length = batch_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) assert len(input_ids) == batch_length, "Error with input length {} vs {}".format( len(input_ids), batch_length ) assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format( len(attention_mask), batch_length ) if self.mode == "classification": label = label_map[example.label] elif self.mode == "regression": label = float(example.label) else: raise ValueError(self.mode) if ex_index < 5 and self.verbose: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("label: %s (id = %d)" % (example.label, label)) features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) if return_tensors is None: return features elif return_tensors == "tf": if not is_tf_available(): raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") import tensorflow as tf def gen(): for ex in features: yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label) dataset = tf.data.Dataset.from_generator( gen, ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])), ) return dataset elif return_tensors == "pt": if not is_torch_available(): raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) if self.mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif self.mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels) return dataset else: raise ValueError("return_tensors should be one of 'tf' or 'pt'") ================================================ FILE: code/bert-base-count3/pretrain/transformers1/data/processors/xnli.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XNLI utils (dataset loading and evaluation) """ import logging import os from .utils import DataProcessor, InputExample logger = logging.getLogger(__name__) class XnliProcessor(DataProcessor): """Processor for the XNLI dataset. Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" def __init__(self, language, train_language=None): self.language = language self.train_language = train_language def get_train_examples(self, data_dir): """See base class.""" lg = self.language if self.train_language is None else self.train_language lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % ("train", i) text_a = line[0] text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_test_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue language = line[0] if language != self.language: continue guid = "%s-%s" % ("test", i) text_a = line[6] text_b = line[7] label = line[1] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] xnli_processors = { "xnli": XnliProcessor, } xnli_output_modes = { "xnli": "classification", } xnli_tasks_num_labels = { "xnli": 3, } ================================================ FILE: code/bert-base-count3/pretrain/transformers1/file.py ================================================ ================================================ FILE: code/bert-base-count3/pretrain/transformers1/file_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import fnmatch import json import logging import os import shutil import sys import tarfile import tempfile from contextlib import contextmanager from functools import partial, wraps from hashlib import sha256 from pathlib import Path from typing import Optional from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile import requests from filelock import FileLock from tqdm.auto import tqdm from . import __version__ logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: USE_TF = os.environ.get("USE_TF", "AUTO").upper() USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): import torch _torch_available = True # pylint: disable=invalid-name logger.info("PyTorch version {} available.".format(torch.__version__)) else: logger.info("Disabling PyTorch because USE_TF is set") _torch_available = False except ImportError: _torch_available = False # pylint: disable=invalid-name try: USE_TF = os.environ.get("USE_TF", "AUTO").upper() USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): import tensorflow as tf assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 _tf_available = True # pylint: disable=invalid-name logger.info("TensorFlow version {} available.".format(tf.__version__)) else: logger.info("Disabling Tensorflow because USE_TORCH is set") _tf_available = False except (ImportError, AssertionError): _tf_available = False # pylint: disable=invalid-name try: from torch.hub import _get_torch_home torch_cache_home = _get_torch_home() except ImportError: torch_cache_home = os.path.expanduser( os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) ) default_cache_path = os.path.join(torch_cache_home, "transformers1") PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) WEIGHTS_NAME = "pytorch_model.bin" TF2_WEIGHTS_NAME = "tf_model.h5" TF_WEIGHTS_NAME = "model.ckpt" CONFIG_NAME = "config.json" MODEL_CARD_NAME = "modelcard.json" MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co" def is_torch_available(): return _torch_available def is_tf_available(): return _tf_available def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator def add_start_docstrings_to_callable(*docstr): def docstring_decorator(fn): class_name = ":class:`~transformers1.{}`".format(fn.__qualname__.split(".")[0]) intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) note = r""" .. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post processing steps while the latter silently ignores them. """ fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = fn.__doc__ + "".join(docstr) return fn return docstring_decorator def is_remote_url(url_or_filename): parsed = urlparse(url_or_filename) return parsed.scheme in ("http", "https") def hf_bucket_url(model_id: str, filename: str, use_cdn=True) -> str: """ Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery Network, or CDN). Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes to the underlying file (default TTL is 24 hours). In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or the other (or switching from one to the other) will affect caching: cached files are not shared between the two because the cached file's name contains a hash of the url. """ endpoint = CLOUDFRONT_DISTRIB_PREFIX if use_cdn else S3_BUCKET_PREFIX legacy_format = "/" not in model_id if legacy_format: return f"{endpoint}/{model_id}-{filename}" else: return f"{endpoint}/{model_id}/{filename}" def url_to_filename(url, etag=None): """ Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can identify it as a HDF5 file (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() if etag: etag_bytes = etag.encode("utf-8") etag_hash = sha256(etag_bytes) filename += "." + etag_hash.hexdigest() if url.endswith(".h5"): filename += ".h5" return filename def filename_to_url(filename, cache_dir=None): """ Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): raise EnvironmentError("file {} not found".format(cache_path)) meta_path = cache_path + ".json" if not os.path.exists(meta_path): raise EnvironmentError("file {} not found".format(meta_path)) with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) url = metadata["url"] etag = metadata["etag"] return url, etag def cached_path( url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None, extract_compressed_file=False, force_extract=False, local_files_only=False, ) -> Optional[str]: """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path. Args: cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). force_download: if True, re-dowload the file even if it's already cached in the cache dir. resume_download: if True, resume the download if incompletly recieved file is found. user_agent: Optional string or dict that will be appended to the user-agent on remote requests. extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed file in a folder along the archive. force_extract: if True when extract_compressed_file is True and the archive was already extracted, re-extract the archive and overide the folder where it was extracted. Return: None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string) otherwise """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(url_or_filename, Path): url_or_filename = str(url_or_filename) if isinstance(cache_dir, Path): cache_dir = str(cache_dir) if is_remote_url(url_or_filename): # URL, so get it from the cache (downloading if necessary) output_path = get_from_cache( url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, user_agent=user_agent, local_files_only=local_files_only, ) elif os.path.exists(url_or_filename): # File, and it exists. output_path = url_or_filename elif urlparse(url_or_filename).scheme == "": # File, but it doesn't exist. raise EnvironmentError("file {} not found".format(url_or_filename)) else: # Something unknown raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) if extract_compressed_file: if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): return output_path # Path where we extract compressed archives # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" output_dir, output_file = os.path.split(output_path) output_extract_dir_name = output_file.replace(".", "-") + "-extracted" output_path_extracted = os.path.join(output_dir, output_extract_dir_name) if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: return output_path_extracted # Prevent parallel extractions lock_path = output_path + ".lock" with FileLock(lock_path): shutil.rmtree(output_path_extracted, ignore_errors=True) os.makedirs(output_path_extracted) if is_zipfile(output_path): with ZipFile(output_path, "r") as zip_file: zip_file.extractall(output_path_extracted) zip_file.close() elif tarfile.is_tarfile(output_path): tar_file = tarfile.open(output_path) tar_file.extractall(output_path_extracted) tar_file.close() else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) return output_path_extracted return output_path def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): ua = "transformers1/{}; python/{}".format(__version__, sys.version.split()[0]) if is_torch_available(): ua += "; torch/{}".format(torch.__version__) if is_tf_available(): ua += "; tensorflow/{}".format(tf.__version__) if isinstance(user_agent, dict): ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) elif isinstance(user_agent, str): ua += "; " + user_agent headers = {"user-agent": ua} if resume_size > 0: headers["Range"] = "bytes=%d-" % (resume_size,) response = requests.get(url, stream=True, proxies=proxies, headers=headers) if response.status_code == 416: # Range not satisfiable return content_length = response.headers.get("Content-Length") total = resume_size + int(content_length) if content_length is not None else None progress = tqdm( unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading", disable=bool(logger.getEffectiveLevel() == logging.NOTSET), ) for chunk in response.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close() def get_from_cache( url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None, local_files_only=False, ) -> Optional[str]: """ Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the path to the cached file. Return: None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string) otherwise """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) os.makedirs(cache_dir, exist_ok=True) etag = None if not local_files_only: try: response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) if response.status_code == 200: etag = response.headers.get("ETag") except (EnvironmentError, requests.exceptions.Timeout): # etag is already None pass filename = url_to_filename(url, etag) # get cache path to put the file cache_path = os.path.join(cache_dir, filename) # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. # try to get the last downloaded one if etag is None: if os.path.exists(cache_path): return cache_path else: matching_files = [ file for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") if not file.endswith(".json") and not file.endswith(".lock") ] if len(matching_files) > 0: return os.path.join(cache_dir, matching_files[-1]) else: # If files cannot be found and local_files_only=True, # the models might've been found if local_files_only=False # Notify the user about that if local_files_only: raise ValueError( "Cannot find the requested files in the cached path and outgoing traffic has been" " disabled. To enable model look-ups and downloads online, set 'local_files_only'" " to False." ) return None # From now on, etag is not None. if os.path.exists(cache_path) and not force_download: return cache_path # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. return cache_path if resume_download: incomplete_path = cache_path + ".incomplete" @contextmanager def _resumable_file_manager(): with open(incomplete_path, "a+b") as f: yield f temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size else: resume_size = 0 else: temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with temp_file_manager() as temp_file: logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) logger.info("storing %s in cache at %s", url, cache_path) os.replace(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) meta = {"url": url, "etag": etag} meta_path = cache_path + ".json" with open(meta_path, "w") as meta_file: json.dump(meta, meta_file) return cache_path class cached_property(property): """ Descriptor that mimics @property but caches output in member variable. From tensorflow_datasets Built-in in functools from Python 3.8. """ def __get__(self, obj, objtype=None): # See docs.python.org/3/howto/descriptor.html#properties if obj is None: return self if self.fget is None: raise AttributeError("unreadable attribute") attr = "__cached_" + self.fget.__name__ cached = getattr(obj, attr, None) if cached is None: cached = self.fget(obj) setattr(obj, attr, cached) return cached def torch_required(func): # Chose a different decorator name than in tests so it's clear they are not the same. @wraps(func) def wrapper(*args, **kwargs): if is_torch_available(): return func(*args, **kwargs) else: raise ImportError(f"Method `{func.__name__}` requires PyTorch.") return wrapper def tf_required(func): # Chose a different decorator name than in tests so it's clear they are not the same. @wraps(func) def wrapper(*args, **kwargs): if is_tf_available(): return func(*args, **kwargs) else: raise ImportError(f"Method `{func.__name__}` requires TF.") return wrapper ================================================ FILE: code/bert-base-count3/pretrain/transformers1/filep.py ================================================ from transformers import GPT2LMHeadModel, GPT2Tokenizer import torch tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPT2LMHeadModel.from_pretrained('gpt2') generated = tokenizer.encode("The Manhattan bridge") context = torch.tensor([generated]) past = None for i in range(15): output, past = model(context, past=past) distribution = output[0, :] # Get the top 10 values' indices and cast them to a list top_values = distribution[-1].topk(10).indices.tolist() # Decode those into words top_words = [tokenizer.decode([x]) for x in top_values.indices.tolist()] # select words (only arbitrarily select the first three) words = words[0:3] # Cast them back to tokens which can be used as an added token selected_tokens = [tokenizer.encode(word) for word in words] generated += [argmax_token.tolist()] context = argmax_token.unsqueeze(0) print(tokenizer.decode([argmax_token.tolist()])) sequence = tokenizer.decode(generated) print(sequence) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/hf_api.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import io import os from os.path import expanduser from typing import Dict, List, Optional, Tuple import requests from tqdm import tqdm ENDPOINT = "https://huggingface.co" class S3Obj: """ Data structure that represents a file belonging to the current user. """ def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs): self.filename = filename self.LastModified = LastModified self.ETag = ETag self.Size = Size class PresignedUrl: def __init__(self, write: str, access: str, type: str, **kwargs): self.write = write self.access = access self.type = type # mime-type to send to S3. class S3Object: """ Data structure that represents a public file accessible on our S3. """ def __init__( self, key: str, # S3 object key etag: str, lastModified: str, size: int, rfilename: str, # filename relative to config.json **kwargs ): self.key = key self.etag = etag self.lastModified = lastModified self.size = size self.rfilename = rfilename class ModelInfo: """ Info about a public model accessible from our S3. """ def __init__( self, modelId: str, # id of model key: str, # S3 object key of config.json author: Optional[str] = None, downloads: Optional[int] = None, tags: List[str] = [], siblings: List[Dict] = [], # list of files that constitute the model **kwargs ): self.modelId = modelId self.key = key self.author = author self.downloads = downloads self.tags = tags self.siblings = [S3Object(**x) for x in siblings] class HfApi: def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else ENDPOINT def login(self, username: str, password: str) -> str: """ Call HF API to sign in a user and get a token if credentials are valid. Outputs: token if credentials are valid Throws: requests.exceptions.HTTPError if credentials are invalid """ path = "{}/api/login".format(self.endpoint) r = requests.post(path, json={"username": username, "password": password}) r.raise_for_status() d = r.json() return d["token"] def whoami(self, token: str) -> Tuple[str, List[str]]: """ Call HF API to know "whoami" """ path = "{}/api/whoami".format(self.endpoint) r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() d = r.json() return d["user"], d["orgs"] def logout(self, token: str) -> None: """ Call HF API to log out. """ path = "{}/api/logout".format(self.endpoint) r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() def presign(self, token: str, filename: str, organization: Optional[str] = None) -> PresignedUrl: """ Call HF API to get a presigned url to upload `filename` to S3. """ path = "{}/api/presign".format(self.endpoint) r = requests.post( path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename, "organization": organization}, ) r.raise_for_status() d = r.json() return PresignedUrl(**d) def presign_and_upload(self, token: str, filename: str, filepath: str, organization: Optional[str] = None) -> str: """ Get a presigned url, then upload file to S3. Outputs: url: Read-only url for the stored file on S3. """ urls = self.presign(token, filename=filename, organization=organization) # streaming upload: # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads # # Even though we presign with the correct content-type, # the client still has to specify it when uploading the file. with open(filepath, "rb") as f: pf = TqdmProgressFileReader(f) data = f if pf.total_size > 0 else "" r = requests.put(urls.write, data=data, headers={"content-type": urls.type}) r.raise_for_status() pf.close() return urls.access def list_objs(self, token: str, organization: Optional[str] = None) -> List[S3Obj]: """ Call HF API to list all stored files for user (or one of their organizations). """ path = "{}/api/listObjs".format(self.endpoint) params = {"organization": organization} if organization is not None else None r = requests.get(path, params=params, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() d = r.json() return [S3Obj(**x) for x in d] def delete_obj(self, token: str, filename: str, organization: Optional[str] = None): """ Call HF API to delete a file stored by user """ path = "{}/api/deleteObj".format(self.endpoint) r = requests.delete( path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename, "organization": organization}, ) r.raise_for_status() def model_list(self) -> List[ModelInfo]: """ Get the public list of all the models on huggingface, including the community models """ path = "{}/api/models".format(self.endpoint) r = requests.get(path) r.raise_for_status() d = r.json() return [ModelInfo(**x) for x in d] class TqdmProgressFileReader: """ Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) and override `f.read()` so as to display a tqdm progress bar. see github.com/huggingface/transformers1/pull/2078#discussion_r354739608 for implementation details. """ def __init__(self, f: io.BufferedReader): self.f = f self.total_size = os.fstat(f.fileno()).st_size self.pbar = tqdm(total=self.total_size, leave=False) self.read = f.read f.read = self._read def _read(self, n=-1): self.pbar.update(n) return self.read(n) def close(self): self.pbar.close() class HfFolder: path_token = expanduser("~/.huggingface/token") @classmethod def save_token(cls, token): """ Save token, creating folder as needed. """ os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) with open(cls.path_token, "w+") as f: f.write(token) @classmethod def get_token(cls): """ Get token or None if not existent. """ try: with open(cls.path_token, "r") as f: return f.read() except FileNotFoundError: pass @classmethod def delete_token(cls): """ Delete token. Do not fail if token does not exist. """ try: os.remove(cls.path_token) except FileNotFoundError: pass ================================================ FILE: code/bert-base-count3/pretrain/transformers1/hf_argparser.py ================================================ import dataclasses import json import sys from argparse import ArgumentParser from enum import Enum from pathlib import Path from typing import Any, Iterable, List, NewType, Tuple, Union DataClass = NewType("DataClass", Any) DataClassType = NewType("DataClassType", Any) class HfArgumentParser(ArgumentParser): """ This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments. The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed) arguments to the parser after initialization and you'll get the output back after parsing as an additional namespace. """ dataclass_types: Iterable[DataClassType] def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType]], **kwargs): """ Args: dataclass_types: Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. kwargs: (Optional) Passed to `argparse.ArgumentParser()` in the regular way. """ super().__init__(**kwargs) if dataclasses.is_dataclass(dataclass_types): dataclass_types = [dataclass_types] self.dataclass_types = dataclass_types for dtype in self.dataclass_types: self._add_dataclass_arguments(dtype) def _add_dataclass_arguments(self, dtype: DataClassType): for field in dataclasses.fields(dtype): field_name = f"--{field.name}" kwargs = field.metadata.copy() # field.metadata is not used at all by Data Classes, # it is provided as a third-party extension mechanism. if isinstance(field.type, str): raise ImportError( "This implementation is not compatible with Postponed Evaluation of Annotations (PEP 563)," "which can be opted in from Python 3.7 with `from __future__ import annotations`." "We will add compatibility when Python 3.9 is released." ) typestring = str(field.type) for prim_type in (int, float, str): for collection in (List,): if typestring == f"typing.Union[{collection[prim_type]}, NoneType]": field.type = collection[prim_type] if typestring == f"typing.Union[{prim_type.__name__}, NoneType]": field.type = prim_type if isinstance(field.type, type) and issubclass(field.type, Enum): kwargs["choices"] = list(field.type) kwargs["type"] = field.type if field.default is not dataclasses.MISSING: kwargs["default"] = field.default elif field.type is bool: kwargs["action"] = "store_false" if field.default is True else "store_true" if field.default is True: field_name = f"--no-{field.name}" kwargs["dest"] = field.name elif hasattr(field.type, "__origin__") and issubclass(field.type.__origin__, List): kwargs["nargs"] = "+" kwargs["type"] = field.type.__args__[0] assert all( x == kwargs["type"] for x in field.type.__args__ ), "{} cannot be a List of mixed types".format(field.name) if field.default_factory is not dataclasses.MISSING: kwargs["default"] = field.default_factory() else: kwargs["type"] = field.type if field.default is not dataclasses.MISSING: kwargs["default"] = field.default else: kwargs["required"] = True self.add_argument(field_name, **kwargs) def parse_args_into_dataclasses( self, args=None, return_remaining_strings=False, look_for_args_file=True ) -> Tuple[DataClass, ...]: """ Parse command-line args into instances of the specified dataclass types. This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args Args: args: List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) return_remaining_strings: If true, also return a list of remaining argument strings. look_for_args_file: If true, will look for a ".args" file with the same base name as the entry point script for this process, and will append its potential content to the command line args. Returns: Tuple consisting of: - the dataclass instances in the same order as they were passed to the initializer.abspath - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser after initialization. - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) """ if look_for_args_file and len(sys.argv): args_file = Path(sys.argv[0]).with_suffix(".args") if args_file.exists(): fargs = args_file.read_text().split() args = fargs + args if args is not None else fargs + sys.argv[1:] # in case of duplicate arguments the first one has precedence # so we append rather than prepend. namespace, remaining_args = self.parse_known_args(args=args) outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype)} inputs = {k: v for k, v in vars(namespace).items() if k in keys} for k in keys: delattr(namespace, k) obj = dtype(**inputs) outputs.append(obj) if len(namespace.__dict__) > 0: # additional namespace. outputs.append(namespace) if return_remaining_strings: return (*outputs, remaining_args) else: if remaining_args: raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}") return (*outputs,) def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. """ data = json.loads(Path(json_file).read_text()) outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype)} inputs = {k: v for k, v in data.items() if k in keys} obj = dtype(**inputs) outputs.append(obj) return (*outputs,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modelcard.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Configuration base class and utilities.""" import copy import json import logging import os from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP from .file_utils import ( CONFIG_NAME, MODEL_CARD_NAME, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url, ) logger = logging.getLogger(__name__) class ModelCard: r""" Structured Model Card class. Store model card as well as methods for loading/downloading/saving model cards. Please read the following paper for details and explanation on the sections: "Model Cards for Model Reporting" by Margaret Mitchell, Simone Wu, Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer, Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards. Link: https://arxiv.org/abs/1810.03993 Note: A model card can be loaded and saved to disk. Parameters: """ def __init__(self, **kwargs): # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers) self.model_details = kwargs.pop("model_details", {}) self.intended_use = kwargs.pop("intended_use", {}) self.factors = kwargs.pop("factors", {}) self.metrics = kwargs.pop("metrics", {}) self.evaluation_data = kwargs.pop("evaluation_data", {}) self.training_data = kwargs.pop("training_data", {}) self.quantitative_analyses = kwargs.pop("quantitative_analyses", {}) self.ethical_considerations = kwargs.pop("ethical_considerations", {}) self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {}) # Open additional attributes for key, value in kwargs.items(): try: setattr(self, key, value) except AttributeError as err: logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err def save_pretrained(self, save_directory_or_file): """ Save a model card object to the directory or file `save_directory_or_file`. """ if os.path.isdir(save_directory_or_file): # If we save using the predefined names, we can load using `from_pretrained` output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME) else: output_model_card_file = save_directory_or_file self.to_json_file(output_model_card_file) logger.info("Model card saved in {}".format(output_model_card_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiate a :class:`~transformers1.ModelCard` from a pre-trained model model card. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a model card file saved using the :func:`~transformers1.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache should not be used. kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading. - The values in kwargs of any keys which are model card attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. find_from_standard_name: (`optional`) boolean, default True: If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename. Can be used to directly feed a model/config url and access the colocated modelcard. return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final model card object. - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored. Examples:: modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')` modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json') modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) """ cache_dir = kwargs.pop("cache_dir", None) proxies = kwargs.pop("proxies", None) find_from_standard_name = kwargs.pop("find_from_standard_name", True) return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: # For simplicity we use the same pretrained url than the configuration files # but with a different suffix (modelcard.json). This suffix is replaced below. model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] elif os.path.isdir(pretrained_model_name_or_path): model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): model_card_file = pretrained_model_name_or_path else: model_card_file = hf_bucket_url(pretrained_model_name_or_path, filename=MODEL_CARD_NAME, use_cdn=False) if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME) model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME) try: # Load from URL or cache if already cached resolved_model_card_file = cached_path( model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False ) if resolved_model_card_file is None: raise EnvironmentError if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: logger.info( "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file) ) # Load model card modelcard = cls.from_json_file(resolved_model_card_file) except (EnvironmentError, json.JSONDecodeError): # We fall back on creating an empty model card modelcard = cls() # Update model card with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(modelcard, key): setattr(modelcard, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model card: %s", str(modelcard)) if return_unused_kwargs: return modelcard, kwargs else: return modelcard @classmethod def from_dict(cls, json_object): """Constructs a `ModelCard` from a Python dictionary of parameters.""" return cls(**json_object) @classmethod def from_json_file(cls, json_file): """Constructs a `ModelCard` from a json file of parameters.""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) def __eq__(self, other): return self.__dict__ == other.__dict__ def __repr__(self): return str(self.to_json_string()) def to_dict(self): """Serializes this instance to a Python dictionary.""" output = copy.deepcopy(self.__dict__) return output def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path): """ Save this instance to a json file.""" with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_albert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch ALBERT model. """ import logging import math import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_albert import AlbertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "albert-base-v1", "albert-large-v1", "albert-xlarge-v1", "albert-xxlarge-v1", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", # See all ALBERT models at https://huggingface.co/models?filter=albert ] def load_tf_weights_in_albert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): print(name) for name, array in zip(names, arrays): original_name = name # If saved from the TF HUB module name = name.replace("module/", "") # Renaming and simplifying name = name.replace("ffn_1", "ffn") name = name.replace("bert/", "albert/") name = name.replace("attention_1", "attention") name = name.replace("transform/", "") name = name.replace("LayerNorm_1", "full_layer_layer_norm") name = name.replace("LayerNorm", "attention/LayerNorm") name = name.replace("transformer/", "") # The feed forward layer had an 'intermediate' step which has been abstracted away name = name.replace("intermediate/dense/", "") name = name.replace("ffn/intermediate/output/dense/", "ffn_output/") # ALBERT attention was split between self and output which have been abstracted away name = name.replace("/output/", "/") name = name.replace("/self/", "/") # The pooler is a linear layer name = name.replace("pooler/dense", "pooler") # The classifier was simplified to predictions from cls/predictions name = name.replace("cls/predictions", "predictions") name = name.replace("predictions/attention", "predictions") # Naming was changed to be more explicit name = name.replace("embeddings/attention", "embeddings") name = name.replace("inner_group_", "albert_layers/") name = name.replace("group_", "albert_layer_groups/") # Classifier if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): name = "classifier/" + name # No ALBERT model currently handles the next sentence prediction task if "seq_relationship" in name: name = name.replace("seq_relationship/output_", "sop_classifier/classifier/") name = name.replace("weights", "weight") name = name.split("/") # Ignore the gradients applied by the LAMB/ADAM optimizers. if ( "adam_m" in name or "adam_v" in name or "AdamWeightDecayOptimizer" in name or "AdamWeightDecayOptimizer_1" in name or "global_step" in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise print("Initialize PyTorch weight {} from {}".format(name, original_name)) pointer.data = torch.from_numpy(array) return model class AlbertEmbeddings(BertEmbeddings): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__(config) self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) class AlbertAttention(BertSelfAttention): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.hidden_size = config.hidden_size self.attention_head_size = config.hidden_size // config.num_attention_heads self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.num_attention_heads, self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.query = prune_linear_layer(self.query, index) self.key = prune_linear_layer(self.key, index) self.value = prune_linear_layer(self.value, index) self.dense = prune_linear_layer(self.dense, index, dim=1) # Update hyper params and store pruned heads self.num_attention_heads = self.num_attention_heads - len(heads) self.all_head_size = self.attention_head_size * self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input_ids, attention_mask=None, head_mask=None): mixed_query_layer = self.query(input_ids) mixed_key_layer = self.key(input_ids) mixed_value_layer = self.value(input_ids) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # Should find a better way to do this w = ( self.dense.weight.t() .view(self.num_attention_heads, self.attention_head_size, self.hidden_size) .to(context_layer.dtype) ) b = self.dense.bias.to(context_layer.dtype) projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b projected_context_layer_dropout = self.dropout(projected_context_layer) layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout) return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,) class AlbertLayer(nn.Module): def __init__(self, config): super().__init__() self.config = config self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = AlbertAttention(config) self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) self.activation = ACT2FN[config.hidden_act] def forward(self, hidden_states, attention_mask=None, head_mask=None): attention_output = self.attention(hidden_states, attention_mask, head_mask) ffn_output = self.ffn(attention_output[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) return (hidden_states,) + attention_output[1:] # add attentions if we output them class AlbertLayerGroup(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) def forward(self, hidden_states, attention_mask=None, head_mask=None): layer_hidden_states = () layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index]) hidden_states = layer_output[0] if self.output_attentions: layer_attentions = layer_attentions + (layer_output[1],) if self.output_hidden_states: layer_hidden_states = layer_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (layer_hidden_states,) if self.output_attentions: outputs = outputs + (layer_attentions,) return outputs # last-layer hidden state, (layer hidden states), (layer attentions) class AlbertTransformer(nn.Module): def __init__(self, config): super().__init__() self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) def forward(self, hidden_states, attention_mask=None, head_mask=None): hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_attentions = () if self.output_hidden_states: all_hidden_states = (hidden_states,) for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( hidden_states, attention_mask, head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], ) hidden_states = layer_group_output[0] if self.output_attentions: all_attentions = all_attentions + layer_group_output[-1] if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class AlbertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = AlbertConfig base_model_prefix = "albert" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) ALBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Args: config (:class:`~transformers1.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ALBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.AlbertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", ALBERT_START_DOCSTRING, ) class AlbertModel(AlbertPreTrainedModel): config_class = AlbertConfig load_tf_weights = load_tf_weights_in_albert base_model_prefix = "albert" def __init__(self, config): super().__init__(config) self.config = config self.embeddings = AlbertEmbeddings(config) self.encoder = AlbertTransformer(config) self.pooler = nn.Linear(config.hidden_size, config.hidden_size) self.pooler_activation = nn.Tanh() self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.embeddings.word_embeddings new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) self.embeddings.word_embeddings = new_embeddings return self.embeddings.word_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers. These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer, while [2,3] correspond to the two inner groups of the second hidden layer. Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more information about head pruning """ for layer, heads in heads_to_prune.items(): group_idx = int(layer / self.config.inner_group_num) inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Example:: from transformers1 import AlbertModel, AlbertTokenizer import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertModel.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) outputs = (sequence_output, pooled_output) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs @add_start_docstrings( """Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `sentence order prediction (classification)` head. """, ALBERT_START_DOCSTRING, ) class AlbertForPreTraining(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.albert = AlbertModel(config) self.predictions = AlbertMLMHead(config) self.sop_classifier = AlbertSOPHead(config) self.init_weights() self.tie_weights() def tie_weights(self): self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, sentence_order_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForPreTraining import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForPreTraining.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, sop_scores = outputs[:2] """ outputs = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output) outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here if masked_lm_labels is not None and sentence_order_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, sop_scores, (hidden_states), (attentions) class AlbertMLMHead(nn.Module): def __init__(self, config): super().__init__() self.LayerNorm = nn.LayerNorm(config.embedding_size) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) self.dense = nn.Linear(config.hidden_size, config.embedding_size) self.decoder = nn.Linear(config.embedding_size, config.vocab_size) self.activation = ACT2FN[config.hidden_act] # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.LayerNorm(hidden_states) hidden_states = self.decoder(hidden_states) prediction_scores = hidden_states return prediction_scores class AlbertSOPHead(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) def forward(self, pooled_output): dropout_pooled_output = self.dropout(pooled_output) logits = self.classifier(dropout_pooled_output) return logits @add_start_docstrings( "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.albert = AlbertModel(config) self.predictions = AlbertMLMHead(config) self.init_weights() self.tie_weights() def tie_weights(self): self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Example:: from transformers1 import AlbertTokenizer, AlbertForMaskedLM import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_outputs = outputs[0] prediction_scores = self.predictions(sequence_outputs) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs @add_start_docstrings( """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ALBERT_START_DOCSTRING, ) class AlbertForSequenceClassification(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForSequenceClassification import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForTokenClassification import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForTokenClassification.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels)[active_loss] active_labels = labels.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ALBERT_START_DOCSTRING, ) class AlbertForQuestionAnswering(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` Span-start scores (before SoftMax). end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers1 import AlbertTokenizer, AlbertForQuestionAnswering import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_dict = tokenizer.encode_plus(question, text, return_tensors='pt') start_scores, end_scores = model(**input_dict) """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Model class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BartConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, EncoderDecoderConfig, FlaubertConfig, GPT2Config, LongformerConfig, OpenAIGPTConfig, ReformerConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, ) from .configuration_marian import MarianConfig from .configuration_utils import PretrainedConfig from .modeling_albert import ( AlbertForMaskedLM, AlbertForPreTraining, AlbertForQuestionAnswering, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertModel, ) from .modeling_bart import BartForConditionalGeneration, BartForSequenceClassification, BartModel from .modeling_bert import ( BertForMaskedLM, BertForMultipleChoice, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, BertModel, ) from .modeling_camembert import ( CamembertForMaskedLM, CamembertForMultipleChoice, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, ) from .modeling_ctrl import CTRLLMHeadModel, CTRLModel from .modeling_distilbert import ( DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, ) from .modeling_electra import ( ElectraForMaskedLM, ElectraForPreTraining, ElectraForSequenceClassification, ElectraForTokenClassification, ElectraModel, ) from .modeling_encoder_decoder import EncoderDecoderModel from .modeling_flaubert import ( FlaubertForQuestionAnsweringSimple, FlaubertForSequenceClassification, FlaubertModel, FlaubertWithLMHeadModel, ) from .modeling_gpt2 import GPT2LMHeadModel, GPT2Model from .modeling_longformer import ( LongformerForMaskedLM, LongformerForMultipleChoice, LongformerForQuestionAnswering, LongformerForSequenceClassification, LongformerForTokenClassification, LongformerModel, ) from .modeling_marian import MarianMTModel from .modeling_openai import OpenAIGPTLMHeadModel, OpenAIGPTModel from .modeling_reformer import ReformerModel, ReformerModelWithLMHead from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) from .modeling_t5 import T5ForConditionalGeneration, T5Model from .modeling_transfo_xl import TransfoXLLMHeadModel, TransfoXLModel from .modeling_xlm import ( XLMForQuestionAnsweringSimple, XLMForSequenceClassification, XLMForTokenClassification, XLMModel, XLMWithLMHeadModel, ) from .modeling_xlm_roberta import ( XLMRobertaForMaskedLM, XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, ) from .modeling_xlnet import ( XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, XLNetForSequenceClassification, XLNetForTokenClassification, XLNetLMHeadModel, XLNetModel, ) logger = logging.getLogger(__name__) MODEL_MAPPING = OrderedDict( [ (T5Config, T5Model), (DistilBertConfig, DistilBertModel), (AlbertConfig, AlbertModel), (CamembertConfig, CamembertModel), (XLMRobertaConfig, XLMRobertaModel), (BartConfig, BartModel), (LongformerConfig, LongformerModel), (RobertaConfig, RobertaModel), (BertConfig, BertModel), (OpenAIGPTConfig, OpenAIGPTModel), (GPT2Config, GPT2Model), (TransfoXLConfig, TransfoXLModel), (XLNetConfig, XLNetModel), (FlaubertConfig, FlaubertModel), (XLMConfig, XLMModel), (CTRLConfig, CTRLModel), (ElectraConfig, ElectraModel), (ReformerConfig, ReformerModel), ] ) MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForPreTraining), (CamembertConfig, CamembertForMaskedLM), (XLMRobertaConfig, XLMRobertaForMaskedLM), (BartConfig, BartForConditionalGeneration), (LongformerConfig, LongformerForMaskedLM), (RobertaConfig, RobertaForMaskedLM), (BertConfig, BertForPreTraining), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), (ElectraConfig, ElectraForPreTraining), ] ) MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForMaskedLM), (CamembertConfig, CamembertForMaskedLM), (XLMRobertaConfig, XLMRobertaForMaskedLM), (MarianConfig, MarianMTModel), (BartConfig, BartForConditionalGeneration), (LongformerConfig, LongformerForMaskedLM), (RobertaConfig, RobertaForMaskedLM), (BertConfig, BertForMaskedLM), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), (ElectraConfig, ElectraForMaskedLM), (EncoderDecoderConfig, EncoderDecoderModel), (ReformerConfig, ReformerModelWithLMHead), ] ) MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForSequenceClassification), (AlbertConfig, AlbertForSequenceClassification), (CamembertConfig, CamembertForSequenceClassification), (XLMRobertaConfig, XLMRobertaForSequenceClassification), (BartConfig, BartForSequenceClassification), (LongformerConfig, LongformerForSequenceClassification), (RobertaConfig, RobertaForSequenceClassification), (BertConfig, BertForSequenceClassification), (XLNetConfig, XLNetForSequenceClassification), (FlaubertConfig, FlaubertForSequenceClassification), (XLMConfig, XLMForSequenceClassification), (ElectraConfig, ElectraForSequenceClassification), ] ) MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), (LongformerConfig, LongformerForQuestionAnswering), (RobertaConfig, RobertaForQuestionAnswering), (BertConfig, BertForQuestionAnswering), (XLNetConfig, XLNetForQuestionAnsweringSimple), (FlaubertConfig, FlaubertForQuestionAnsweringSimple), (XLMConfig, XLMForQuestionAnsweringSimple), ] ) MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForTokenClassification), (CamembertConfig, CamembertForTokenClassification), (XLMConfig, XLMForTokenClassification), (XLMRobertaConfig, XLMRobertaForTokenClassification), (LongformerConfig, LongformerForTokenClassification), (RobertaConfig, RobertaForTokenClassification), (BertConfig, BertForTokenClassification), (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), (ElectraConfig, ElectraForTokenClassification), ] ) MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ (CamembertConfig, CamembertForMultipleChoice), (XLMRobertaConfig, XLMRobertaForMultipleChoice), (LongformerConfig, LongformerForMultipleChoice), (RobertaConfig, RobertaForMultipleChoice), (BertConfig, BertForMultipleChoice), (XLNetConfig, XLNetForMultipleChoice), ] ) class AutoModel: r""" :class:`~transformers1.AutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or the `AutoModel.from_config(config)` class methods. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModel is designed to be instantiated " "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModel.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertModel` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerModel` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaModel` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModel` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2Model` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraModel` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5Model` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertModel` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertModel` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertModel` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaModel` (XLM-RoBERTa model) - `longformer` :class:`~transformers1.LongformerModel` (Longformer model) - `roberta`: :class:`~transformers1.RobertaModel` (RoBERTa model) - `bert`: :class:`~transformers1.BertModel` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2Model` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraModel` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys()) ) ) class AutoModelForPreTraining: r""" :class:`~transformers1.AutoModelForPreTraining` is a generic model class that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForPreTraining is designed to be instantiated " "using the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForPreTraining.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForPreTraining` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForPreTraining` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForMaskedLM` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForMaskedLM` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForMaskedLM` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - `roberta`: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.BertForPreTraining` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraForPreTraining` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForPreTraining.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForPreTraining.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) class AutoModelWithLMHead: r""" :class:`~transformers1.AutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelWithLMHead is designed to be instantiated " "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelWithLMHead.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForMaskedLM` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForMaskedLM` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelWithLMHead.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the language modeling model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForMaskedLM` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForMaskedLM` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForMaskedLM` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - `roberta`: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.BertForMaskedLM` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraForMaskedLM` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) class AutoModelForSequenceClassification: r""" :class:`~transformers1.AutoModelForSequenceClassification` is a generic model class that will be instantiated as one of the sequence classification model classes of the library when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForSequenceClassification is designed to be instantiated " "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForSequenceClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForSequenceClassification` (DistilBERT model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForSequenceClassification` (ALBERT model) - isInstance of `camembert` configuration class: :class:`~transformers1.CamembertForSequenceClassification` (CamemBERT model) - isInstance of `xlm roberta` configuration class: :class:`~transformers1.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForSequenceClassification` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForSequenceClassification` (Bert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetForSequenceClassification` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForSequenceClassification` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertForSequenceClassification` (Flaubert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the sequence classification model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForSequenceClassification` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForSequenceClassification` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForSequenceClassification` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) - `roberta`: :class:`~transformers1.RobertaForSequenceClassification` (RoBERTa model) - `bert`: :class:`~transformers1.BertForSequenceClassification` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForSequenceClassification` (XLNet model) - `flaubert`: :class:`~transformers1.FlaubertForSequenceClassification` (Flaubert model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaining positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) class AutoModelForQuestionAnswering: r""" :class:`~transformers1.AutoModelForQuestionAnswering` is a generic model class that will be instantiated as one of the question answering model classes of the library when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForQuestionAnswering is designed to be instantiated " "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForQuestionAnswering.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForQuestionAnswering` (DistilBERT model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForQuestionAnswering` (ALBERT model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModelForQuestionAnswering` (Bert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetForQuestionAnswering` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForQuestionAnswering` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertForQuestionAnswering` (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForQuestionAnswering.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForQuestionAnswering` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForQuestionAnswering` (ALBERT model) - `bert`: :class:`~transformers1.BertForQuestionAnswering` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForQuestionAnswering` (XLNet model) - `xlm`: :class:`~transformers1.XLMForQuestionAnswering` (XLM model) - `flaubert`: :class:`~transformers1.FlaubertForQuestionAnswering` (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) class AutoModelForTokenClassification: r""" :class:`~transformers1.AutoModelForTokenClassification` is a generic model class that will be instantiated as one of the token classification model classes of the library when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForTokenClassification is designed to be instantiated " "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForTokenClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertModelForTokenClassification` (DistilBERT model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForTokenClassification` (XLM model) - isInstance of `xlm roberta` configuration class: :class:`~transformers1.XLMRobertaModelForTokenClassification` (XLMRoberta model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModelForTokenClassification` (Bert model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForTokenClassification` (AlBert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetModelForTokenClassification` (XLNet model) - isInstance of `camembert` configuration class: :class:`~transformers1.CamembertModelForTokenClassification` (Camembert model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaModelForTokenClassification` (Roberta model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForTokenClassification` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForTokenClassification` (DistilBERT model) - `xlm`: :class:`~transformers1.XLMForTokenClassification` (XLM model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForTokenClassification` (XLM-RoBERTa?Para model) - `camembert`: :class:`~transformers1.CamembertForTokenClassification` (Camembert model) - `bert`: :class:`~transformers1.BertForTokenClassification` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForTokenClassification` (XLNet model) - `roberta`: :class:`~transformers1.RobertaForTokenClassification` (Roberta model) - `electra`: :class:`~transformers1.ElectraForTokenClassification` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) class AutoModelForMultipleChoice: r""" :class:`~transformers1.AutoModelForMultipleChoice` is a generic model class that will be instantiated as one of the multiple choice model classes of the library when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForMultipleChoice is designed to be instantiated " "using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForMultipleChoice.from_config(config)` methods." ) @classmethod def from_config(cls, config): for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch BART model, ported from the fairseq repo.""" import logging import math import random from typing import Dict, List, Optional, Tuple import numpy as np import torch import torch.nn.functional as F from torch import Tensor, nn from .activations import ACT2FN from .configuration_bart import BartConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids logger = logging.getLogger(__name__) BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/bart-large", "facebook/bart-large-mnli", "facebook/bart-large-cnn", "facebook/bart-large-xsum", "facebook/mbart-large-en-ro", # See all BART models at https://huggingface.co/models?filter=bart ] BART_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior. Parameters: config (:class:`~transformers1.BartConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BART_GENERATION_EXAMPLE = r""" Examples:: from transformers1 import BartTokenizer, BartForConditionalGeneration, BartConfig # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example model = BartForConditionalGeneration.from_pretrained('bart-large-cnn') tokenizer = BartTokenizer.from_pretrained('bart-large-cnn') ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') # Generate Summary summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) """ BART_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them. Padding will be ignored by default should you provide it. Indices can be obtained using :class:`transformers1.BartTokenizer.encode(text)`. attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices in input_ids. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. If you want to change padding behavior, you should read :func:`~transformers1.modeling_bart._prepare_decoder_inputs` and modify. See diagram 1 in the paper for more info on the default strategy """ def invert_mask(attention_mask): assert attention_mask.dim() == 2 return attention_mask.eq(0) def _prepare_bart_decoder_inputs( config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32 ): """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided. This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during generation """ pad_token_id = config.pad_token_id if decoder_input_ids is None: decoder_input_ids = shift_tokens_right(input_ids, pad_token_id) bsz, tgt_len = decoder_input_ids.size() if decoder_padding_mask is None: decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id) else: decoder_padding_mask = invert_mask(decoder_padding_mask) causal_mask = torch.triu(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to( dtype=causal_mask_dtype, device=decoder_input_ids.device ) return decoder_input_ids, decoder_padding_mask, causal_mask class PretrainedBartModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" def _init_weights(self, module): std = self.config.init_std if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, SinusoidalPositionalEmbedding): pass elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @property def dummy_inputs(self): pad_token = self.config.pad_token_id input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, } return dummy_inputs def _make_linear_from_emb(emb): vocab_size, emb_size = emb.weight.shape lin_layer = nn.Linear(vocab_size, emb_size, bias=False) lin_layer.weight.data = emb.weight.data return lin_layer # Helper Functions, mostly for making masks def _check_shapes(shape_1, shape2): if shape_1 != shape2: raise AssertionError("shape mismatch: {} != {}".format(shape_1, shape2)) def shift_tokens_right(input_ids, pad_token_id): """Shift input ids one token to the right, and wrap the last non pad token (usually ).""" prev_output_tokens = input_ids.clone() index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze() prev_output_tokens[:, 1:] = input_ids[:, :-1] return prev_output_tokens def make_padding_mask(input_ids, padding_idx=1): """True for pad tokens""" padding_mask = input_ids.eq(padding_idx) if not padding_mask.any(): padding_mask = None return padding_mask # Helper Modules class EncoderLayer(nn.Module): def __init__(self, config: BartConfig): super().__init__() self.embed_dim = config.d_model self.output_attentions = config.output_attentions self.self_attn = SelfAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, ) self.normalize_before = config.normalize_before self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) def forward(self, x, encoder_padding_mask): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. for t_tgt, t_src is excluded (or masked out), =0 means it is included in attention Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) x, attn_weights = self.self_attn( query=x, key=x, key_padding_mask=encoder_padding_mask, need_weights=self.output_attentions ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.self_attn_layer_norm(x) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = F.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.final_layer_norm(x) return x, attn_weights class BartEncoder(nn.Module): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a :class:`EncoderLayer`. Args: config: BartConfig """ def __init__(self, config: BartConfig, embed_tokens): super().__init__() self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states embed_dim = embed_tokens.embedding_dim self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 self.padding_idx = embed_tokens.padding_idx self.max_source_positions = config.max_position_embeddings self.embed_tokens = embed_tokens if config.static_position_embeddings: self.embed_positions = SinusoidalPositionalEmbedding( config.max_position_embeddings, embed_dim, self.padding_idx ) else: self.embed_positions = LearnedPositionalEmbedding( config.max_position_embeddings, embed_dim, self.padding_idx, ) self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() # mbart has one extra layer_norm self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None def forward( self, input_ids, attention_mask=None, ): """ Args: input_ids (LongTensor): tokens in the source language of shape `(batch, src_len)` attention_mask (torch.LongTensor): indicating which indices are padding tokens. Returns: Tuple comprised of: - **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch, embed_dim)`. Only populated if *self.output_hidden_states:* is True. - **all_attentions** (List[Tensor]): Attention weights for each layer. During training might not be of length n_layers because of layer dropout. """ # check attention mask and invert if attention_mask is not None: attention_mask = invert_mask(attention_mask) inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale embed_pos = self.embed_positions(input_ids) x = inputs_embeds + embed_pos x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) encoder_states, all_attentions = [], [] for encoder_layer in self.layers: if self.output_hidden_states: encoder_states.append(x) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): # skip the layer attn = None else: x, attn = encoder_layer(x, attention_mask) if self.output_attentions: all_attentions.append(attn) if self.layer_norm: x = self.layer_norm(x) if self.output_hidden_states: encoder_states.append(x) # T x B x C -> B x T x C encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states] x = x.transpose(0, 1) return x, encoder_states, all_attentions class DecoderLayer(nn.Module): def __init__(self, config: BartConfig): super().__init__() self.embed_dim = config.d_model self.output_attentions = config.output_attentions self.self_attn = SelfAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.normalize_before = config.normalize_before self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.encoder_attn = SelfAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, encoder_decoder_attention=True, ) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) def forward( self, x, encoder_hidden_states, encoder_attn_mask=None, layer_state=None, causal_mask=None, decoder_padding_mask=None, ): residual = x if layer_state is None: layer_state = {} if self.normalize_before: x = self.self_attn_layer_norm(x) # Self Attention x, self_attn_weights = self.self_attn( query=x, key=x, layer_state=layer_state, # adds keys to layer state key_padding_mask=decoder_padding_mask, attn_mask=causal_mask, need_weights=self.output_attentions, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.self_attn_layer_norm(x) # Cross attention residual = x assert self.encoder_attn.cache_key != self.self_attn.cache_key if self.normalize_before: x = self.encoder_attn_layer_norm(x) x, _ = self.encoder_attn( query=x, key=encoder_hidden_states, key_padding_mask=encoder_attn_mask, layer_state=layer_state, # mutates layer state ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.encoder_attn_layer_norm(x) # Fully Connected residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = F.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.final_layer_norm(x) return ( x, self_attn_weights, layer_state, ) # just self_attn weights for now, following t5, layer_state = cache for decoding class BartDecoder(nn.Module): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`. Args: config: BartConfig embed_tokens (torch.nn.Embedding): output embedding """ def __init__(self, config: BartConfig, embed_tokens: nn.Embedding): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop self.padding_idx = embed_tokens.padding_idx self.max_target_positions = config.max_position_embeddings self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 self.embed_tokens = embed_tokens if config.static_position_embeddings: self.embed_positions = SinusoidalPositionalEmbedding( config.max_position_embeddings, config.d_model, config.pad_token_id ) else: self.embed_positions = LearnedPositionalEmbedding( config.max_position_embeddings, config.d_model, self.padding_idx, ) self.layers = nn.ModuleList( [DecoderLayer(config) for _ in range(config.decoder_layers)] ) # type: List[DecoderLayer] self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None def forward( self, input_ids, encoder_hidden_states, encoder_padding_mask, decoder_padding_mask, decoder_causal_mask, decoder_cached_states=None, use_cache=False, **unused ): """ Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019). Args: input_ids (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_hidden_states: output from the encoder, used for encoder-side attention encoder_padding_mask: for ignoring pad tokens decoder_cached_states (dict or None): dictionary used for storing state during generation Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - hidden states - attentions """ # check attention mask and invert if encoder_padding_mask is not None: encoder_padding_mask = invert_mask(encoder_padding_mask) # embed positions positions = self.embed_positions(input_ids, use_cache=use_cache) if use_cache: input_ids = input_ids[:, -1:] positions = positions[:, -1:] # happens after we embed them # assert input_ids.ne(self.padding_idx).any() x = self.embed_tokens(input_ids) * self.embed_scale x += positions x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) x = x.transpose(0, 1) encoder_hidden_states = encoder_hidden_states.transpose(0, 1) # decoder layers all_hidden_states = () all_self_attns = () next_decoder_cache = [] for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if self.output_hidden_states: all_hidden_states += (x,) dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): continue layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None x, layer_self_attn, layer_past = decoder_layer( x, encoder_hidden_states, encoder_attn_mask=encoder_padding_mask, decoder_padding_mask=decoder_padding_mask, layer_state=layer_state, causal_mask=decoder_causal_mask, ) if use_cache: next_decoder_cache.append(layer_past.copy()) if self.layer_norm and (idx == len(self.layers) - 1): # last layer of mbart x = self.layer_norm(x) if self.output_attentions: all_self_attns += (layer_self_attn,) # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states] x = x.transpose(0, 1) encoder_hidden_states = encoder_hidden_states.transpose(0, 1) if use_cache: next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache) else: next_cache = None return x, next_cache, all_hidden_states, list(all_self_attns) def _reorder_buffer(attn_cache, new_order): for k, input_buffer_k in attn_cache.items(): if input_buffer_k is not None: attn_cache[k] = input_buffer_k.index_select(0, new_order) return attn_cache class SelfAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( self, embed_dim, num_heads, dropout=0.0, bias=True, encoder_decoder_attention=False, # otherwise self_attention ): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim ** -0.5 self.encoder_decoder_attention = encoder_decoder_attention self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self" def _shape(self, tensor, dim_0, bsz): return tensor.contiguous().view(dim_0, bsz * self.num_heads, self.head_dim).transpose(0, 1) def forward( self, query, key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, layer_state: Optional[Dict[str, Optional[Tensor]]] = None, attn_mask: Optional[Tensor] = None, need_weights=False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time(SeqLen) x Batch x Channel""" static_kv: bool = self.encoder_decoder_attention tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] # get here for encoder decoder cause of static_kv if layer_state is not None: # reuse k,v and encoder_padding_mask saved_state = layer_state.get(self.cache_key, {}) if "prev_key" in saved_state: # previous time steps are cached - no need to recompute key and value if they are static if static_kv: key = None else: saved_state = None layer_state = {} q = self.q_proj(query) * self.scaling if static_kv: if key is None: k = v = None else: k = self.k_proj(key) v = self.v_proj(key) else: k = self.k_proj(query) v = self.v_proj(query) q = self._shape(q, tgt_len, bsz) if k is not None: k = self._shape(k, -1, bsz) if v is not None: v = self._shape(v, -1, bsz) if saved_state is not None: k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz) # Update cache layer_state[self.cache_key] = { "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim), "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim), "prev_key_padding_mask": key_padding_mask if not static_kv else None, } assert k is not None src_len = k.size(1) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len) if attn_mask is not None: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) # This is part of a workaround to get around fork/join parallelism not supporting Optional types. if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,) if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2) attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = F.softmax(attn_weights, dim=-1) attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,) assert v is not None attn_output = torch.bmm(attn_probs, v) assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn_output = self.out_proj(attn_output) if need_weights: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) else: attn_weights = None return attn_output, attn_weights def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz): # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: v = prev_value else: assert v is not None v = torch.cat([prev_value, v], dim=1) assert k is not None and v is not None prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None) key_padding_mask = self._cat_prev_key_padding_mask( key_padding_mask, prev_key_padding_mask, bsz, k.size(1), static_kv ) return k, v, key_padding_mask @staticmethod def _cat_prev_key_padding_mask( key_padding_mask: Optional[Tensor], prev_key_padding_mask: Optional[Tensor], batch_size: int, src_len: int, static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None: if static_kv: new_key_padding_mask = prev_key_padding_mask else: new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1) elif key_padding_mask is not None: filler = torch.zeros( batch_size, src_len - key_padding_mask.size(1), dtype=key_padding_mask.dtype, device=key_padding_mask.device, ) new_key_padding_mask = torch.cat([filler, key_padding_mask], dim=1) else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask class BartClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" # This can trivially be shared with RobertaClassificationHead def __init__( self, input_dim, inner_dim, num_classes, pooler_dropout, ): super().__init__() self.dense = nn.Linear(input_dim, inner_dim) self.dropout = nn.Dropout(p=pooler_dropout) self.out_proj = nn.Linear(inner_dim, num_classes) def forward(self, x): x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x class LearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to the forward function. """ def __init__( self, num_embeddings: int, embedding_dim: int, padding_idx: int, ): # if padding_idx is specified then offset the embedding ids by # this index and adjust num_embeddings appropriately assert padding_idx is not None num_embeddings += padding_idx + 1 # WHY? super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) def forward(self, input, use_cache=False): """Input is expected to be of size [bsz x seqlen].""" if use_cache: # the position is our current step in the decoded sequence pos = int(self.padding_idx + input.size(1)) positions = input.data.new(1, 1).fill_(pos) else: positions = create_position_ids_from_input_ids(input, self.padding_idx) return super().forward(positions) def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True): if torch.cuda.is_available(): try: from apex.normalization import FusedLayerNorm return FusedLayerNorm(normalized_shape, eps, elementwise_affine) except ImportError: pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) def fill_with_neg_inf(t): """FP16-compatible function that fills a input_ids with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _filter_out_falsey_values(tup) -> Tuple: """Remove entries that are None or [] from an iterable.""" return tuple(x for x in tup if isinstance(x, torch.Tensor) or x) # Public API def _get_shape(t): return getattr(t, "shape", None) @add_start_docstrings( "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING, ) class BartModel(PretrainedBartModel): def __init__(self, config: BartConfig): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states padding_idx, vocab_size = config.pad_token_id, config.vocab_size self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) self.encoder = BartEncoder(config, self.shared) self.decoder = BartDecoder(config, self.shared) self.init_weights() @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, encoder_outputs: Optional[Tuple] = None, decoder_attention_mask=None, decoder_cached_states=None, use_cache=False, ): # make masks if user doesn't supply if not use_cache: decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_bart_decoder_inputs( self.config, input_ids, decoder_input_ids=decoder_input_ids, decoder_padding_mask=decoder_attention_mask, causal_mask_dtype=self.shared.weight.dtype, ) else: decoder_padding_mask, causal_mask = None, None assert decoder_input_ids is not None if encoder_outputs is None: encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) assert isinstance(encoder_outputs, tuple) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) decoder_outputs = self.decoder( decoder_input_ids, encoder_outputs[0], attention_mask, decoder_padding_mask, decoder_causal_mask=causal_mask, decoder_cached_states=decoder_cached_states, use_cache=use_cache, ) # Attention and hidden_states will be [] or None if they aren't needed decoder_outputs: Tuple = _filter_out_falsey_values(decoder_outputs) assert isinstance(decoder_outputs[0], torch.Tensor) encoder_outputs: Tuple = _filter_out_falsey_values(encoder_outputs) return decoder_outputs + encoder_outputs def get_input_embeddings(self): return self.shared def set_input_embeddings(self, value): self.shared = value self.encoder.embed_tokens = self.shared self.decoder.embed_tokens = self.shared def get_output_embeddings(self): return _make_linear_from_emb(self.shared) # make it on the fly @add_start_docstrings( "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING + BART_GENERATION_EXAMPLE, ) class BartForConditionalGeneration(PretrainedBartModel): base_model_prefix = "model" def __init__(self, config: BartConfig): super().__init__(config) base_model = BartModel(config) self.model = base_model self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: old_num_tokens = self.model.shared.num_embeddings new_embeddings = super().resize_token_embeddings(new_num_tokens) self.model.shared = new_embeddings self._resize_final_logits_bias(new_num_tokens, old_num_tokens) return new_embeddings def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None: if new_num_tokens <= old_num_tokens: new_bias = self.final_logits_bias[:, :new_num_tokens] else: extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_cached_states=None, lm_labels=None, use_cache=False, **unused ): r""" lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # Mask filling only works for bart-large from transformers1 import BartTokenizer, BartForConditionalGeneration tokenizer = BartTokenizer.from_pretrained('bart-large') TXT = "My friends are but they eat too many carbs." model = BartForConditionalGeneration.from_pretrained('bart-large') input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids'] logits = model(input_ids)[0] masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() probs = logits[0, masked_index].softmax(dim=0) values, predictions = probs.topk(5) tokenizer.decode(predictions).split() # ['good', 'great', 'all', 'really', 'very'] """ outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, encoder_outputs=encoder_outputs, decoder_attention_mask=decoder_attention_mask, decoder_cached_states=decoder_cached_states, use_cache=use_cache, ) lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here if lm_labels is not None: loss_fct = nn.CrossEntropyLoss() # TODO(SS): do we need to ignore pad tokens in lm_labels? masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step, decoder_cached_states are empty if not past[1]: encoder_outputs, decoder_cached_states = past, None else: encoder_outputs, decoder_cached_states = past return { "input_ids": None, # encoder_outputs is defined. input_ids not needed "encoder_outputs": encoder_outputs, "decoder_cached_states": decoder_cached_states, "decoder_input_ids": decoder_input_ids, "attention_mask": attention_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } def prepare_logits_for_generation(self, logits, cur_len, max_length): if cur_len == 1: self._force_token_ids_generation(logits, self.config.bos_token_id) if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_ids_generation(logits, self.config.eos_token_id) return logits def _force_token_ids_generation(self, scores, token_ids) -> None: """force one of token_ids to be generated by setting prob of all other tokens to 0""" if isinstance(token_ids, int): token_ids = [token_ids] all_but_token_ids_mask = torch.tensor( [x for x in range(self.config.vocab_size) if x not in token_ids], dtype=torch.long, device=next(self.parameters()).device, ) assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" scores[:, all_but_token_ids_mask] = -float("inf") @staticmethod def _reorder_cache(past, beam_idx): ((enc_out, enc_mask), decoder_cached_states) = past reordered_past = [] for layer_past in decoder_cached_states: # get the correct batch idx from decoder layer's batch dim for cross and self-attn layer_past_new = { attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() } reordered_past.append(layer_past_new) new_enc_out = enc_out if enc_out is None else enc_out.index_select(0, beam_idx) new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx) past = ((new_enc_out, new_enc_mask), reordered_past) return past def get_encoder(self): return self.model.encoder def get_output_embeddings(self): return _make_linear_from_emb(self.model.shared) # make it on the fly @add_start_docstrings( """Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BART_START_DOCSTRING, ) class BartForSequenceClassification(PretrainedBartModel): def __init__(self, config: BartConfig, **kwargs): super().__init__(config, **kwargs) self.model = BartModel(config) self.classification_head = BartClassificationHead( config.d_model, config.d_model, config.num_labels, config.classif_dropout, ) self.model._init_weights(self.classification_head.dense) self.model._init_weights(self.classification_head.out_proj) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BartConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification loss (cross entropy) logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BartTokenizer, BartForSequenceClassification import torch tokenizer = BartTokenizer.from_pretrained('bart-large') model = BartForSequenceClassification.from_pretrained('bart-large') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, encoder_outputs=encoder_outputs, ) x = outputs[0] # last hidden state eos_mask = input_ids.eq(self.config.eos_token_id) if len(torch.unique(eos_mask.sum(1))) > 1: raise ValueError("All examples must have the same number of tokens.") sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] logits = self.classification_head(sentence_representation) # Prepend logits outputs = (logits,) + outputs[1:] # Add hidden states and attention if they are here if labels is not None: # prepend loss to output, loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs class SinusoidalPositionalEmbedding(nn.Embedding): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions, embedding_dim, padding_idx=None): super().__init__(num_positions, embedding_dim) if embedding_dim % 2 != 0: raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") self.weight = self._init_weight(self.weight) @staticmethod def _init_weight(out: nn.Parameter): """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in the 2nd half of the vector. [dim // 2:] """ n_pos, dim = out.shape position_enc = np.array( [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] ) out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) # This line breaks for odd n_pos out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False return out @torch.no_grad() def forward(self, input_ids, use_cache=False): """Input is expected to be of size [bsz x seqlen].""" bsz, seq_len = input_ids.shape[:2] if use_cache: positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing else: # starts at 0, ends at 1-seq_len positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device) return super().forward(positions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_beam_search.py ================================================ # coding=utf-8 # Copyright (c) 2019 Yang Liu # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """ A general wrapper around models with LM heads to generate sequences using beam search. """ import torch from torch import nn class TransformerBeamSearch(nn.Module): def __init__( self, model, tokenizer, batch_size, beam_size, min_length, max_length, alpha=0, block_repeating_trigram=True, ): """ Attributes: mask_word_id: token id that corresponds to the mask """ super(TransformerBeamSearch, self).__init__() self.model = model self.tokenizer = tokenizer self.start_token_id = tokenizer.start_token_id self.end_token_id = tokenizer.end_token_id self.pad_token_id = tokenizer.pad_token_id self.beam_size = beam_size self.min_length = min_length self.max_length = max_length self.block_repeating_trigram = block_repeating_trigram self.apply_length_penalty = False if alpha == 0 else True self.alpha = alpha # State of the beam self.hypotheses = [[] for _ in range(batch_size)] self.batch_offset = torch.arange(batch_size, dtype=torch.long) self.beam_offset = torch.arange( 0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long ) self.growing_beam = torch.full( (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long ) self.topk_log_probabilities = torch.tensor( [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float ).repeat(batch_size) self.results = { "prediction": [[] for _ in batch_size], "scores": [[] for _ in batch_size], } self._step = 0 self.is_done = False def step(self, log_probabilities): """ Grows the beam by one step. """ self._step += 1 # The batch size changes as some beams finish so we define _B vocab_size = log_probabilities.size(-1) _B = log_probabilities.size(0) // self.beam_size # Multiply each beam probability with the probability of the # next token (conditioned on the words in the beam). log_probabilities += self.topk_log_probabilities.view(-1, 1) self.enforce_min_length(log_probabilities) if self.block_repeating_trigram: self.remove_repeating_trigrams(log_probabilities, _B) # Find the `beam_size` (previous_beam + token) combinations with # the highest score topk_log_probabilities, topk_ids = log_probabilities.topk( log_probabilities.view(_B, self.beam_size * vocab_size), self.beam_size, dim=1, ) # Apply the length penalty. The +1 accounts for the [EOS] token # that will be added if the beam ends. topk_scores = topk_log_probabilities / self.length_penalty() # Retrieve the corresponding respective beam and token id # topk_token_ids[i] will be added to topk_beam_ids[i] topk_beam_ids = topk_ids.div(vocab_size) topk_token_ids = topk_ids.fmod(vocab_size) # Retrieve the row index of the surviving beams in the original # view of the log_probabilities tensor surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view( -1 ) # Append the last predictions self.growing_beam = torch.cat( [ self.growing_beam.index_select(0, surviving_beams_rows), topk_token_ids.view(-1, 1), ], 1, ) # Check if any of the beam searches has ended during this # growth step. Also if top beam (most probable) has ended # for one element of the batch. is_finished = topk_token_ids.eq(self.end_token_id) self.enforce_max_length() is_top_beam_finished = is_finished[:, 0].eq(1) # Save the finished searches if is_finished.any(): predictions = self.growing_beam.view( -1, self.beam_size, self.growing_beam.size(1) ) for i in range(is_finished.size(0)): if is_top_beam_finished[i]: is_finished[i].fill_(1) finished_hyp = is_finished[i].nonzero().view(-1) # Store finished hypotheses for this batch. b = self.batch_offset[i] for j in finished_hyp: self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :])) # If the batch reached the end, save the best hypotheses # in terms of length-penalized score. if is_top_beam_finished[i]: best_hyp = sorted( self.hypotheses[b], key=lambda x: x[0], reverse=True ) best_score, best_prediction = best_hyp[0] self.results["scores"][b].append(best_score) self.results["predictions"][b].append(best_prediction) non_finished = is_top_beam_finished.eq(0).nonzero().view(-1) if len(non_finished) == 0: self.is_done = True # Remove finished batches for the next step. topk_log_probabilities = topk_log_probabilities.index_select( 0, non_finished ) self.batch_offset = self.batch_offset.index_select(0, non_finished) self.growing_beam = predictions.index_select(0, non_finished).view( -1, self.growing_beam.size(-1) ) surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished) return surviving_beams_rows def forward(self, encoder_input_ids, **kwargs): # keyword arguments come in 3 flavors: encoder-specific (prefixed by # `encoder_`), decoder-specific (prefixed by `decoder_`) and those # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. kwargs_encoder = { argument[len("encoder_"):]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_"):]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } kwargs_common = { argument: value for argument, value in kwargs.items() if not (argument.startswith("encoder_") or argument.startswith("decoder_")) } kwargs_decoder = dict(kwargs_common, **kwargs_decoder) kwargs_encoder = dict(kwargs_common, **kwargs_encoder) # forward pass on the encoder encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder) kwargs_decoder["encoder_hidden_states"] = tile( encoder_outputs, self.beam_size, dim=0 ) # grow the beam by generating sequences in an autoregressive way self.growing_beam = torch.full( (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long ) for step in range(self.max_length): decoder_input = self.growing_beam[:, -1] outputs = self.model.decoder(decoder_input, kwargs_decoder) log_probabilities = torch.nn.functional.log_softmax(outputs[1]) surviving_beams_rows = self.step(log_probabilities) if self.is_done: break kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[ "encoder_hidden_states" ].index_select(0, surviving_beams_rows) return self.results def remove_repeating_trigrams(self, log_probabilities, _B): if(self._step + 1 > 3): for i in range(_B * self.beam_size): tokens = [t for t in self.growing_beam[i]] trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)] last_trigram = tuple(trigrams[-1]) if last_trigram in trigrams[:-1]: log_probabilities[i] = -1e20 def enforce_min_length(self): if self._step < self.min_length: self.log_probabilities[self.end_token_id] = -1e20 def enforce_max_length(self): if self._step + 1 == self.max_length: self.is_finished.fill_(1) def length_penalty(self): return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha def tile(x, count, dim=0): """ Tiles `x` along dimension `dim` `count` times. Example: >> ex = torch.tensor([1,2],[3,4]) >> tile(ex, 2, 0) torch.Tensor([[1,2],[1,2],[3,4],[3,4]]) """ perm = list(range(len(x.size()))) if dim != 0: perm[0], perm[dim] = perm[dim], perm[0] x = x.permute(perm).contiguous() out_size = list(x.size()) out_size[0] *= count batch = x.size(0) x = ( x.view(batch, -1) .transpose(0, 1) .repeat(count, 1) .transpose(0, 1) .contiguous() .view(*out_size) ) if dim != 0: x = x.permute(perm).contiguous() return x ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch BERT model. """ import logging import math import os import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .activations import gelu, gelu_new, swish from .configuration_bert import BertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased", "bert-base-chinese", "bert-base-german-cased", "bert-large-uncased-whole-word-masking", "bert-large-cased-whole-word-masking", "bert-large-uncased-whole-word-masking-finetuned-squad", "bert-large-cased-whole-word-masking-finetuned-squad", "bert-base-cased-finetuned-mrpc", "bert-base-german-dbmdz-cased", "bert-base-german-dbmdz-uncased", "cl-tohoku/bert-base-japanese", "cl-tohoku/bert-base-japanese-whole-word-masking", "cl-tohoku/bert-base-japanese-char", "cl-tohoku/bert-base-japanese-char-whole-word-masking", "TurkuNLP/bert-base-finnish-cased-v1", "TurkuNLP/bert-base-finnish-uncased-v1", "wietsedv/bert-base-dutch-cased", # See all BERT models at https://huggingface.co/models?filter=bert ] def load_tf_weights_in_bert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model def mish(x): return x * torch.tanh(nn.functional.softplus(x)) ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} BertLayerNorm = torch.nn.LayerNorm class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] seq_length = input_shape[1] device = input_ids.device if input_ids is not None else inputs_embeds.device if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class BertSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class BertSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertAttention(nn.Module): def __init__(self, config): super().__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class BertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states class BertOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = BertAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class BertEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class BertPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class BertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() self.transform = BertPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) return hidden_states class BertOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) def forward(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores class BertOnlyNSPHead(nn.Module): def __init__(self, config): super().__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score class BertPreTrainingHeads(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, sequence_output, pooled_output): prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class BertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = BertConfig load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, BertLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() BERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class BertModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) outputs = (prediction_scores, seq_relationship_score,) + outputs[ 2: ] # add hidden states and attention if they are here if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs if lm_labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one prediction_scores = prediction_scores[:, :-1, :].contiguous() lm_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss() ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (ltr_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class BertForNextSentencePrediction(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt') loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) assert logits[0, 0] < logits[0, 1] # next sentence was random """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class BertForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class BertForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." choice0 = "It is eaten with a fork and a knife." choice1 = "It is eaten while held in the hand." labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;)) encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 # the linear classifier still needs to be trained loss, logits = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_camembert.py ================================================ # coding=utf-8 # Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch CamemBERT model. """ import logging from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "camembert-base", "Musixmatch/umberto-commoncrawl-cased-v1", "Musixmatch/umberto-wikipedia-uncased-v1", # See all CamemBERT models at https://huggingface.co/models?filter=camembert ] CAMEMBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", CAMEMBERT_START_DOCSTRING, ) class CamembertModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers1.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForSequenceClassification(RobertaForSequenceClassification): """ This class overrides :class:`~transformers1.RobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForMultipleChoice(RobertaForMultipleChoice): """ This class overrides :class:`~transformers1.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForTokenClassification(RobertaForTokenClassification): """ This class overrides :class:`~transformers1.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """, CAMEMBERT_START_DOCSTRING, ) class CamembertForQuestionAnswering(RobertaForQuestionAnswering): """ This class overrides :class:`~transformers1.RobertaForQuestionAnswering`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch CTRL model.""" import logging import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel logger = logging.getLogger(__name__) CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ctrl" # See all CTRL models at https://huggingface.co/models?filter=ctrl ] def angle_defn(pos, i, d_model_size): angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) return pos * angle_rates def positional_encoding(position, d_model_size, dtype): # create the sinusoidal pattern for the positional encoding angle_rads = angle_defn( torch.arange(position, dtype=dtype).unsqueeze(1), torch.arange(d_model_size, dtype=dtype).unsqueeze(0), d_model_size, ) sines = torch.sin(angle_rads[:, 0::2]) cosines = torch.cos(angle_rads[:, 1::2]) pos_encoding = torch.cat([sines, cosines], dim=-1) return pos_encoding def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) dk = k.shape[-1] scaled_attention_logits = matmul_qk / np.sqrt(dk) if mask is not None: nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask attention_weights = torch.softmax(scaled_attention_logits, dim=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask output = torch.matmul(attention_weights, v) return output, attention_weights class MultiHeadAttention(torch.nn.Module): def __init__(self, d_model_size, num_heads, output_attentions=False): super().__init__() self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size self.depth = int(d_model_size / self.num_heads) self.Wq = torch.nn.Linear(d_model_size, d_model_size) self.Wk = torch.nn.Linear(d_model_size, d_model_size) self.Wv = torch.nn.Linear(d_model_size, d_model_size) self.dense = torch.nn.Linear(d_model_size, d_model_size) def split_into_heads(self, x, batch_size): x = x.reshape(batch_size, -1, self.num_heads, self.depth) return x.permute([0, 2, 1, 3]) def forward(self, v, k, q, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): batch_size = q.shape[0] q = self.Wq(q) k = self.Wk(k) v = self.Wv(v) q = self.split_into_heads(q, batch_size) k = self.split_into_heads(k, batch_size) v = self.split_into_heads(v, batch_size) if layer_past is not None: past_key, past_value = layer_past[0], layer_past[1] k = torch.cat((past_key, k), dim=-2) v = torch.cat((past_value, v), dim=-2) if use_cache is True: present = torch.stack((k, v)) else: present = (None,) output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = output[0].permute([0, 2, 1, 3]) attn = output[1] original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size) output = self.dense(original_size_attention) outputs = (output, present) if self.output_attentions: outputs = outputs + (attn,) return outputs def point_wise_feed_forward_network(d_model_size, dff): return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size)) class EncoderLayer(torch.nn.Module): def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False): super().__init__() self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions) self.ffn = point_wise_feed_forward_network(d_model_size, dff) self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=1e-6) self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=1e-6) self.dropout1 = torch.nn.Dropout(rate) self.dropout2 = torch.nn.Dropout(rate) def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): normed = self.layernorm1(x) attn_outputs = self.multi_head_attention( normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output) out1 = x + attn_output out2 = self.layernorm2(out1) ffn_output = self.ffn(out2) ffn_output = self.dropout2(ffn_output) out2 = out1 + ffn_output outputs = (out2,) + attn_outputs[1:] return outputs class CTRLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = CTRLConfig base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) CTRL_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.CTRLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ CTRL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only input_ids that do not have their past calculated should be passed as input_ids. Indices can be obtained using :class:`transformers1.CTRLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The input_ids which have their past given to this model should not be passed as input ids as they have already been computed. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If `past` is used, optionally only the last `inputs_embeds` have to be input (see `past`). use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. """ @add_start_docstrings( "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", CTRL_START_DOCSTRING, ) class CTRLModel(CTRLPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.d_model_size = config.n_embd self.num_layers = config.n_layer self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float) self.w = nn.Embedding(config.vocab_size, config.n_embd) self.dropout = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList( [ EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions) for _ in range(config.n_layer) ] ) self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() def get_input_embeddings(self): return self.w def set_input_embeddings(self, new_embeddings): self.w = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import CTRLTokenizer, CTRLModel import torch tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLModel.from_pretrained('ctrl') input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size = inputs_embeds.shape[0] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = past[0][0].size(-2) if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: assert batch_size > 0, "batch_size has to be defined and > 0" attention_mask = attention_mask.view(batch_size, -1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layer) if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, input_shape[-1]) token_type_embeds = self.w(token_type_ids) token_type_embeds *= np.sqrt(self.d_model_size) else: token_type_embeds = 0 position_ids = position_ids.view(-1, input_shape[-1]) if inputs_embeds is None: inputs_embeds = self.w(input_ids) # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded seq_len = input_shape[-1] mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device) inputs_embeds *= np.sqrt(self.d_model_size) pos_embeds = self.pos_encoding[position_ids, :].to(inputs_embeds.device) hidden_states = inputs_embeds + pos_embeds + token_type_embeds hidden_states = self.dropout(hidden_states) output_shape = input_shape + (inputs_embeds.size(-1),) presents = () all_hidden_states = () all_attentions = [] for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = h( hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i], use_cache=use_cache, ) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.layernorm(hidden_states) hidden_states = hidden_states.view(*output_shape) if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs @add_start_docstrings( """The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) class CTRLLMHeadModel(CTRLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = CTRLModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) self.init_weights() def get_output_embeddings(self): return self.lm_head def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: input_ids = input_ids[:, -1].unsqueeze(-1) return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=True, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import CTRLTokenizer, CTRLLMHeadModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLLMHeadModel.from_pretrained('ctrl') input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import logging import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import gelu from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters ---------- input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. Outputs ------- embeddings: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None): """ Parameters ---------- query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Outputs ------- weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( config.activation ) self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None): """ Parameters ---------- x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Outputs ------- sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. """ # Self-Attention sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) if self.output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if self.output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward(self, x, attn_mask=None, head_mask=None): """ Parameters ---------- x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Outputs ------- hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () all_attentions = () hidden_state = x for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i]) hidden_state = layer_outputs[-1] if self.output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) outputs = (hidden_state,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, nn.Embedding): if module.weight.requires_grad: module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() DISTILBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.DistilBertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertModel import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertModel.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask) hidden_state = tfmr_output[0] output = (hidden_state,) + tfmr_output[1:] return output # last-layer hidden-state, (all hidden_states), (all attentions) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForMaskedLM import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) outputs = (prediction_logits,) + dlbrt_output[1:] if masked_lm_labels is not None: mlm_loss = self.mlm_loss_fct( prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1) ) outputs = (mlm_loss,) + outputs return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) @add_start_docstrings( """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForSequenceClassification import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForQuestionAnswering import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss, start_scores, end_scores = outputs[:3] """ distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) outputs = (start_logits, end_logits,) + distilbert_output[1:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForTokenClassification import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_electra.py ================================================ import logging import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .activations import get_activation from .configuration_electra import ElectraConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel logger = logging.getLogger(__name__) ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/electra-small-generator", "google/electra-base-generator", "google/electra-large-generator", "google/electra-small-discriminator", "google/electra-base-discriminator", "google/electra-large-discriminator", # See all ELECTRA models at https://huggingface.co/models?filter=electra ] def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): original_name: str = name try: if isinstance(model, ElectraForMaskedLM): name = name.replace("electra/embeddings/", "generator/embeddings/") if discriminator_or_generator == "generator": name = name.replace("electra/", "discriminator/") name = name.replace("generator/", "electra/") name = name.replace("dense_1", "dense_prediction") name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias") name = name.split("/") # print(original_name, name) # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["global_step", "temperature"] for n in name): logger.info("Skipping {}".format(original_name)) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name.endswith("_embeddings"): pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape, original_name except AssertionError as e: e.args += (pointer.shape, array.shape) raise print("Initialize PyTorch weight {}".format(name), original_name) pointer.data = torch.from_numpy(array) except AttributeError as e: print("Skipping {}".format(original_name), name, e) continue return model class ElectraEmbeddings(BertEmbeddings): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): super().__init__(config) self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = BertLayerNorm(config.embedding_size, eps=config.layer_norm_eps) class ElectraDiscriminatorPredictions(nn.Module): """Prediction module for the discriminator, made up of two dense layers.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dense_prediction = nn.Linear(config.hidden_size, 1) self.config = config def forward(self, discriminator_hidden_states, attention_mask): hidden_states = self.dense(discriminator_hidden_states) hidden_states = get_activation(self.config.hidden_act)(hidden_states) logits = self.dense_prediction(hidden_states).squeeze() return logits class ElectraGeneratorPredictions(nn.Module): """Prediction module for the generator, made up of two dense layers.""" def __init__(self, config): super().__init__() self.LayerNorm = BertLayerNorm(config.embedding_size) self.dense = nn.Linear(config.hidden_size, config.embedding_size) def forward(self, generator_hidden_states): hidden_states = self.dense(generator_hidden_states) hidden_states = get_activation("gelu")(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class ElectraPreTrainedModel(BertPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = ElectraConfig load_tf_weights = load_tf_weights_in_electra base_model_prefix = "electra" ELECTRA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.ElectraConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ELECTRA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.ElectraTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " "hidden size and embedding size are different." "" "Both the generator and discriminator checkpoints may be loaded into this model.", ELECTRA_START_DOCSTRING, ) class ElectraModel(ElectraPreTrainedModel): config_class = ElectraConfig def __init__(self, config): super().__init__(config) self.embeddings = ElectraEmbeddings(config) if config.embedding_size != config.hidden_size: self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) self.encoder = BertEncoder(config) self.config = config self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraModel, ElectraTokenizer import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraModel.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) hidden_states = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states) hidden_states = self.encoder(hidden_states, attention_mask=extended_attention_mask, head_mask=head_mask) return hidden_states class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x @add_start_docstrings( """ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ELECTRA_START_DOCSTRING, ) class ElectraForSequenceClassification(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.electra = ElectraModel(config) self.classifier = ElectraClassificationHead(config) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) sequence_output = discriminator_hidden_states[0] logits = self.classifier(sequence_output) outputs = (logits,) + discriminator_hidden_states[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a binary classification head on top as used during pre-training for identifying generated tokens. It is recommended to load the discriminator checkpoint into that model.""", ELECTRA_START_DOCSTRING, ) class ElectraForPreTraining(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.discriminator_predictions = ElectraDiscriminatorPredictions(config) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates the token is an original token, ``1`` indicates the token was replaced. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss of the ELECTRA objective. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) Prediction scores of the head (scores for each token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForPreTraining import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) discriminator_sequence_output = discriminator_hidden_states[0] logits = self.discriminator_predictions(discriminator_sequence_output, attention_mask) output = (logits,) if labels is not None: loss_fct = nn.BCEWithLogitsLoss() if attention_mask is not None: active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] active_labels = labels[active_loss] loss = loss_fct(active_logits, active_labels.float()) else: loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) output = (loss,) + output output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a language modeling head on top. Even though both the discriminator and generator may be loaded into this model, the generator is the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) class ElectraForMaskedLM(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.generator_predictions = ElectraGeneratorPredictions(config) self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) self.init_weights() def get_output_embeddings(self): return self.generator_lm_head @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForMaskedLM import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ generator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) generator_sequence_output = generator_hidden_states[0] prediction_scores = self.generator_predictions(generator_sequence_output) prediction_scores = self.generator_lm_head(prediction_scores) output = (prediction_scores,) # Masked language modeling softmax layer if masked_lm_labels is not None: loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) output = (loss,) + output output += generator_hidden_states[1:] return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a token classification head on top. Both the discriminator and generator may be loaded into this model.""", ELECTRA_START_DOCSTRING, ) class ElectraForTokenClassification(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForTokenClassification import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) output = (logits,) if labels is not None: loss_fct = nn.CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.config.num_labels)[active_loss] active_labels = labels.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) output = (loss,) + output output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classes to support Encoder-Decoder architectures """ import logging from typing import Optional from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_utils import PretrainedConfig from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) class EncoderDecoderModel(PreTrainedModel): r""" :class:`~transformers1.EncoderDecoder` is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method for the encoder and `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method for the decoder. """ config_class = EncoderDecoderConfig base_model_prefix = "encoder_decoder" def __init__( self, config: Optional[PretrainedConfig] = None, encoder: Optional[PreTrainedModel] = None, decoder: Optional[PreTrainedModel] = None, ): assert config is not None or ( encoder is not None and decoder is not None ), "Either a configuration or an Encoder and a decoder has to be provided" if config is None: config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config) else: assert isinstance(config, self.config_class), "config: {} has to be of type {}".format( config, self.config_class ) # initialize with config super().__init__(config) if encoder is None: from transformers import AutoModel encoder = AutoModel.from_config(config.encoder) if decoder is None: from transformers import AutoModelWithLMHead decoder = AutoModelWithLMHead.from_config(config.decoder) self.encoder = encoder self.decoder = decoder assert ( self.encoder.get_output_embeddings() is None ), "The encoder {} should not have a LM Head. Please use a model without LM Head" def tie_weights(self): # for now no weights tying in encoder-decoder pass def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder def get_input_embeddings(self): return self.encoder.get_input_embeddings() def get_output_embeddings(self): return self.decoder.get_output_embeddings() @classmethod def from_encoder_decoder_pretrained( cls, encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, **kwargs ) -> PreTrainedModel: r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints. The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train the model, you need to first set it back in training mode with `model.train()`. Params: encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the encoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the decoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method kwargs: (`optional`) Remaining dictionary of keyword arguments. Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: Examples:: from transformers1 import EncoderDecoder model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert """ kwargs_encoder = { argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } # Load and initialize the encoder and decoder # The distinction between encoder and decoder at the model level is made # by the value of the flag `is_decoder` that we need to set correctly. encoder = kwargs_encoder.pop("model", None) if encoder is None: assert ( encoder_pretrained_model_name_or_path is not None ), "If `model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModel encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) encoder.config.is_decoder = False decoder = kwargs_decoder.pop("model", None) if decoder is None: assert ( decoder_pretrained_model_name_or_path is not None ), "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModelWithLMHead if "config" not in kwargs_decoder: from transformers import AutoConfig decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) if decoder_config.is_decoder is False: logger.info( f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." ) decoder_config.is_decoder = True kwargs_decoder["config"] = decoder_config if kwargs_decoder["config"].is_decoder is False: logger.warning( f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, make sure that the attribute `is_decoder` of `decoder_config` passed to `.from_encoder_decoder_pretrained(...)` is set to `True` or do not pass a `decoder_config` to `.from_encoder_decoder_pretrained(...)`" ) decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) return cls(encoder=encoder, decoder=decoder) def forward( self, input_ids=None, inputs_embeds=None, attention_mask=None, head_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_head_mask=None, decoder_inputs_embeds=None, masked_lm_labels=None, lm_labels=None, **kwargs, ): """ Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary for the encoder. Indices can be obtained using :class:`transformers1.PretrainedTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices for the encoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules for the encoder. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training to the decoder. Indices can be obtained using :class:`transformers1.PretrainedTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules for the decoder. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction) for the decoder. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors: - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function. - With a `decoder_` prefix which will be input as `**decoder_kwargs` for the decoder forward function. Examples:: from transformers1 import EncoderDecoderModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert # forward input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) # training loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2] # generation generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id) """ kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, **kwargs_encoder, ) hidden_states = encoder_outputs[0] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, inputs_embeds=decoder_inputs_embeds, attention_mask=decoder_attention_mask, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, lm_labels=lm_labels, masked_lm_labels=masked_lm_labels, **kwargs_decoder, ) return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, input_ids, past, attention_mask, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if type(past) is tuple: encoder_outputs = past else: encoder_outputs = (past,) decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids) return { "attention_mask": attention_mask, "decoder_attention_mask": decoder_inputs["attention_mask"], "decoder_input_ids": decoder_inputs["input_ids"], "encoder_outputs": encoder_outputs, } def _reorder_cache(self, past, beam_idx): # as a default encoder-decoder models do not re-order the past. # TODO(PVP): might have to be updated, e.g. if GPT2 is to be used as a decoder return past ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Flaubert model, based on XLM. """ import logging import random import torch from torch.nn import functional as F from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_xlm import ( XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, XLMForSequenceClassification, XLMModel, XLMWithLMHeadModel, get_masks, ) logger = logging.getLogger(__name__) FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "flaubert/flaubert_small_cased", "flaubert/flaubert_base_uncased", "flaubert/flaubert_base_cased", "flaubert/flaubert_large_cased", # See all Flaubert models at https://huggingface.co/models?filter=flaubert ] FLAUBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.FlaubertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ FLAUBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`): dictionary with ``torch.FloatTensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", FLAUBERT_START_DOCSTRING, ) class FlaubertModel(XLMModel): config_class = FlaubertConfig def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import FlaubertTokenizer, FlaubertModel import torch tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') model = FlaubertModel.from_pretrained('flaubert-base-cased') input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # removed: src_enc=None, src_len=None if input_ids is not None: bs, slen = input_ids.size() else: bs, slen = inputs_embeds.size()[:-1] if lengths is None: if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs assert lengths.size(0) == bs assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] device = input_ids.device if input_ids is not None else inputs_embeds.device # position_ids if position_ids is None: position_ids = torch.arange(slen, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand((bs, slen)) else: assert position_ids.size() == (bs, slen) # (slen, bs) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: assert langs.size() == (bs, slen) # (slen, bs) # langs = langs.transpose(0, 1) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layers) # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) if langs is not None and self.use_lang_emb and self.config.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = F.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): # LayerDrop dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): continue if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention if not self.pre_norm: attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) else: tensor_normalized = self.layer_norm1[i](tensor) attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN if not self.pre_norm: tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) else: tensor_normalized = self.layer_norm2[i](tensor) tensor = tensor + self.ffns[i](tensor_normalized) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) @add_start_docstrings( """The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, FLAUBERT_START_DOCSTRING, ) class FlaubertWithLMHeadModel(XLMWithLMHeadModel): """ This class overrides :class:`~transformers1.XLMWithLMHeadModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, FLAUBERT_START_DOCSTRING, ) class FlaubertForSequenceClassification(XLMForSequenceClassification): """ This class overrides :class:`~transformers1.XLMForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, ) class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): """ This class overrides :class:`~transformers1.XLMForQuestionAnsweringSimple`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, ) class FlaubertForQuestionAnswering(XLMForQuestionAnswering): """ This class overrides :class:`~transformers1.XLMForQuestionAnswering`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch OpenAI GPT-2 model.""" import logging import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import ACT2FN from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer logger = logging.getLogger(__name__) GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2", # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 ] def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model """ try: import re import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(gpt2_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array.squeeze()) for name, array in zip(names, arrays): name = name[6:] # skip "model/" name = name.split("/") pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): scope_names = re.split(r"(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "w" or scope_names[0] == "g": pointer = getattr(pointer, "weight") elif scope_names[0] == "b": pointer = getattr(pointer, "bias") elif scope_names[0] == "wpe" or scope_names[0] == "wte": pointer = getattr(pointer, scope_names[0]) pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super().__init__() self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.register_buffer( "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx) ) self.register_buffer("masked_bias", torch.tensor(-1e4)) self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / (float(v.size(-1)) ** 0.5) nd, ns = w.size(-2), w.size(-1) mask = self.bias[:, :, ns - nd : ns, :ns] w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype)) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [torch.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = x.permute(0, 2, 1, 3).contiguous() new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states def split_heads(self, x, k=False): new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states if k: return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) else: return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) if layer_past is not None: past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below key = torch.cat((past_key, key), dim=-1) value = torch.cat((past_value, value), dim=-2) if use_cache is True: present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking else: present = (None,) attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) outputs = [a, present] + attn_outputs[1:] return outputs # a, present, (attentions) class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = ACT2FN[config.activation_function] self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) return self.dropout(h2) class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super().__init__() nx = config.n_embd self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): output_attn = self.attn( self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a m = self.mlp(self.ln_2(x)) x = x + m outputs = [x] + output_attn[1:] return outputs # x, present, (attentions) class GPT2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = GPT2Config load_tf_weights = load_tf_weights_in_gpt2 base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) GPT2_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only `input_ids` that do not have their past calculated should be passed as `input_ids`. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The `input_ids` which have their past given to this model should not be passed as `input_ids` as they have already been computed. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`, defaults to :obj:`None`): `input_ids_length` = `sequence_length if `past` is None else 1 Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If `past` is used, optionally only the last `inputs_embeds` have to be input (see `past`). use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. """ @add_start_docstrings( "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.wte = nn.Embedding(config.vocab_size, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() def get_input_embeddings(self): return self.wte def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. If `past` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import GPT2Tokenizer, GPT2Model import torch tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size = inputs_embeds.shape[0] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, input_shape[-1]) if position_ids is not None: position_ids = position_ids.view(-1, input_shape[-1]) if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = past[0][0].size(-2) if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: assert batch_size > 0, "batch_size has to be defined and > 0" attention_mask = attention_mask.view(batch_size, -1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) presents = () all_attentions = [] all_hidden_states = () for i, (block, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = block( hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i], use_cache=use_cache, ) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.ln_f(hidden_states) hidden_states = hidden_states.view(*output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, (presents), (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, ) class GPT2LMHeadModel(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.init_weights() def get_output_embeddings(self): return self.lm_head def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: input_ids = input_ids[:, -1].unsqueeze(-1) return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=True, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import GPT2Tokenizer, GPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, GPT2_START_DOCSTRING, ) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) config.num_labels = 1 self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, lm_labels=None, mc_labels=None, use_cache=True, ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): Language modeling loss. mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import GPT2Tokenizer, GPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') # Add a [CLS] to the vocabulary (we should train it also!) tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch Longformer model. """ import logging import math import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .configuration_longformer import LongformerConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertPreTrainedModel from .modeling_roberta import RobertaLMHead, RobertaModel logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "allenai/longformer-base-4096", "allenai/longformer-large-4096", "allenai/longformer-large-4096-finetuned-triviaqa", "allenai/longformer-base-4096-extra.pos.embd.only", "allenai/longformer-large-4096-extra.pos.embd.only", # See all Longformer models at https://huggingface.co/models?filter=longformer ] def _get_question_end_index(input_ids, sep_token_id): """ Computes the index of the first occurance of `sep_token_id`. """ sep_token_indices = (input_ids == sep_token_id).nonzero() batch_size = input_ids.shape[0] assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions" assert ( sep_token_indices.shape[0] == 3 * batch_size ), f"There should be exactly three separator tokens: {sep_token_id} in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this error." return sep_token_indices.view(batch_size, 3, 2)[:, 0, 1] def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True): """ Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is True` else after `sep_token_id`. """ question_end_index = _get_question_end_index(input_ids, sep_token_id) question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1 # bool attention mask with True in locations of global attention attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device) if before_sep_token is True: attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.uint8) else: # last token is separation token and should not be counted and in the middle are two separation tokens attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.uint8) * ( attention_mask.expand_as(input_ids) < input_ids.shape[-1] ).to(torch.uint8) return attention_mask class LongformerSelfAttention(nn.Module): def __init__(self, config, layer_id): super().__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size self.query = nn.Linear(config.hidden_size, self.embed_dim) self.key = nn.Linear(config.hidden_size, self.embed_dim) self.value = nn.Linear(config.hidden_size, self.embed_dim) # separate projection layers for tokens with global attention self.query_global = nn.Linear(config.hidden_size, self.embed_dim) self.key_global = nn.Linear(config.hidden_size, self.embed_dim) self.value_global = nn.Linear(config.hidden_size, self.embed_dim) self.dropout = config.attention_probs_dropout_prob self.layer_id = layer_id attention_window = config.attention_window[self.layer_id] assert ( attention_window % 2 == 0 ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}" assert ( attention_window > 0 ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}" self.one_sided_attention_window_size = attention_window // 2 @staticmethod def _skew(x, direction): """Convert diagonals into columns (or columns into diagonals depending on `direction`""" x_padded = F.pad(x, direction) # padding value is not important because it will be overwritten x_padded = x_padded.view(*x_padded.size()[:-2], x_padded.size(-1), x_padded.size(-2)) return x_padded @staticmethod def _skew2(x): """shift every row 1 step to right converting columns into diagonals""" # X = B x C x M x L B, C, M, L = x.size() x = F.pad(x, (0, M + 1)) # B x C x M x (L+M+1). Padding value is not important because it'll be overwritten x = x.view(B, C, -1) # B x C x ML+MM+M x = x[:, :, :-M] # B x C x ML+MM x = x.view(B, C, M, M + L) # B x C, M x L+M x = x[:, :, :, :-1] return x @staticmethod def _chunk(x, w): """convert into overlapping chunkings. Chunk size = 2w, overlap size = w""" # non-overlapping chunks of size = 2w x = x.view(x.size(0), x.size(1) // (w * 2), w * 2, x.size(2)) # use `as_strided` to make the chunks overlap with an overlap size = w chunk_size = list(x.size()) chunk_size[1] = chunk_size[1] * 2 - 1 chunk_stride = list(x.stride()) chunk_stride[1] = chunk_stride[1] // 2 return x.as_strided(size=chunk_size, stride=chunk_stride) def _mask_invalid_locations(self, input_tensor, w) -> torch.Tensor: affected_seqlen = w beginning_mask_2d = input_tensor.new_ones(w, w + 1).tril().flip(dims=[0]) beginning_mask = beginning_mask_2d[None, :, None, :] ending_mask = beginning_mask.flip(dims=(1, 3)) seqlen = input_tensor.size(1) beginning_input = input_tensor[:, :affected_seqlen, :, : w + 1] beginning_mask = beginning_mask[:, :seqlen].expand(beginning_input.size()) beginning_input.masked_fill_(beginning_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 ending_input = input_tensor[:, -affected_seqlen:, :, -(w + 1) :] ending_mask = ending_mask[:, -seqlen:].expand(ending_input.size()) ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 def _sliding_chunks_matmul_qk(self, q: torch.Tensor, k: torch.Tensor, w: int): """Matrix multiplicatio of query x key tensors using with a sliding window attention pattern. This implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an overlap of size w""" batch_size, seqlen, num_heads, head_dim = q.size() assert seqlen % (w * 2) == 0, f"Sequence length should be multiple of {w * 2}. Given {seqlen}" assert q.size() == k.size() chunks_count = seqlen // w - 1 # group batch_size and num_heads dimensions into one, then chunk seqlen into chunks of size w * 2 q = q.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) k = k.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) chunk_q = self._chunk(q, w) chunk_k = self._chunk(k, w) # matrix multipication # bcxd: batch_size * num_heads x chunks x 2w x head_dim # bcyd: batch_size * num_heads x chunks x 2w x head_dim # bcxy: batch_size * num_heads x chunks x 2w x 2w chunk_attn = torch.einsum("bcxd,bcyd->bcxy", (chunk_q, chunk_k)) # multiply # convert diagonals into columns diagonal_chunk_attn = self._skew(chunk_attn, direction=(0, 0, 0, 1)) # allocate space for the overall attention matrix where the chunks are compined. The last dimension # has (w * 2 + 1) columns. The first (w) columns are the w lower triangles (attention from a word to # w previous words). The following column is attention score from each word to itself, then # followed by w columns for the upper triangle. diagonal_attn = diagonal_chunk_attn.new_empty((batch_size * num_heads, chunks_count + 1, w, w * 2 + 1)) # copy parts from diagonal_chunk_attn into the compined matrix of attentions # - copying the main diagonal and the upper triangle diagonal_attn[:, :-1, :, w:] = diagonal_chunk_attn[:, :, :w, : w + 1] diagonal_attn[:, -1, :, w:] = diagonal_chunk_attn[:, -1, w:, : w + 1] # - copying the lower triangle diagonal_attn[:, 1:, :, :w] = diagonal_chunk_attn[:, :, -(w + 1) : -1, w + 1 :] diagonal_attn[:, 0, 1:w, 1:w] = diagonal_chunk_attn[:, 0, : w - 1, 1 - w :] # separate batch_size and num_heads dimensions again diagonal_attn = diagonal_attn.view(batch_size, num_heads, seqlen, 2 * w + 1).transpose(2, 1) self._mask_invalid_locations(diagonal_attn, w) return diagonal_attn def _sliding_chunks_matmul_pv(self, prob: torch.Tensor, v: torch.Tensor, w: int): """Same as _sliding_chunks_matmul_qk but for prob and value tensors. It is expecting the same output format from _sliding_chunks_matmul_qk""" batch_size, seqlen, num_heads, head_dim = v.size() assert seqlen % (w * 2) == 0 assert prob.size()[:3] == v.size()[:3] assert prob.size(3) == 2 * w + 1 chunks_count = seqlen // w - 1 # group batch_size and num_heads dimensions into one, then chunk seqlen into chunks of size 2w chunk_prob = prob.transpose(1, 2).reshape(batch_size * num_heads, seqlen // w, w, 2 * w + 1) # group batch_size and num_heads dimensions into one v = v.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) # pad seqlen with w at the beginning of the sequence and another w at the end padded_v = F.pad(v, (0, 0, w, w), value=-1) # chunk padded_v into chunks of size 3w and an overlap of size w chunk_v_size = (batch_size * num_heads, chunks_count + 1, 3 * w, head_dim) chunk_v_stride = padded_v.stride() chunk_v_stride = chunk_v_stride[0], w * chunk_v_stride[1], chunk_v_stride[1], chunk_v_stride[2] chunk_v = padded_v.as_strided(size=chunk_v_size, stride=chunk_v_stride) skewed_prob = self._skew2(chunk_prob) context = torch.einsum("bcwd,bcdh->bcwh", (skewed_prob, chunk_v)) return context.view(batch_size, num_heads, seqlen, head_dim).transpose(1, 2) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): """ LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`. Padding to `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer. The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to -ve: no attention 0: local attention +ve: global attention `encoder_hidden_states` and `encoder_attention_mask` are not supported and should be None """ # TODO: add support for `encoder_hidden_states` and `encoder_attention_mask` assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None" assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and shiould be None" if attention_mask is not None: attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) key_padding_mask = attention_mask < 0 extra_attention_mask = attention_mask > 0 remove_from_windowed_attention_mask = attention_mask != 0 num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1) max_num_extra_indices_per_batch = num_extra_indices_per_batch.max() if max_num_extra_indices_per_batch <= 0: extra_attention_mask = None else: # To support the case of variable number of global attention in the rows of a batch, # we use the following three selection masks to select global attention embeddings # in a 3d tensor and pad it to `max_num_extra_indices_per_batch` # 1) selecting embeddings that correspond to global attention extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True) zero_to_max_range = torch.arange( 0, max_num_extra_indices_per_batch, device=num_extra_indices_per_batch.device ) # mask indicating which values are actually going to be padding selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1) # 2) location of the non-padding values in the selected global attention selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True) # 3) location of the padding values in the selected global attention selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True) else: remove_from_windowed_attention_mask = None extra_attention_mask = None key_padding_mask = None hidden_states = hidden_states.transpose(0, 1) seqlen, batch_size, embed_dim = hidden_states.size() assert embed_dim == self.embed_dim q = self.query(hidden_states) k = self.key(hidden_states) v = self.value(hidden_states) q /= math.sqrt(self.head_dim) q = q.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) k = k.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) # attn_weights = (batch_size, seqlen, num_heads, window*2+1) attn_weights = self._sliding_chunks_matmul_qk(q, k, self.one_sided_attention_window_size) self._mask_invalid_locations(attn_weights, self.one_sided_attention_window_size) if remove_from_windowed_attention_mask is not None: # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 # from (batch_size x seqlen) to (batch_size x seqlen x num_heads x hidden_size) remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze( dim=-1 ) # cast to fp32/fp16 then replace 1's with -inf float_mask = remove_from_windowed_attention_mask.type_as(q).masked_fill( remove_from_windowed_attention_mask, -10000.0 ) ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones # diagonal mask with zeros everywhere and -inf inplace of padding d_mask = self._sliding_chunks_matmul_qk(ones, float_mask, self.one_sided_attention_window_size) attn_weights += d_mask assert list(attn_weights.size()) == [ batch_size, seqlen, self.num_heads, self.one_sided_attention_window_size * 2 + 1, ] # the extra attention if extra_attention_mask is not None: selected_k = k.new_zeros(batch_size, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros] # (batch_size, seqlen, num_heads, max_num_extra_indices_per_batch) selected_attn_weights = torch.einsum("blhd,bshd->blhs", (q, selected_k)) selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000 # concat to attn_weights # (batch_size, seqlen, num_heads, extra attention count + 2*window+1) attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) attn_weights_fp32 = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability attn_weights = attn_weights_fp32.type_as(attn_weights) if key_padding_mask is not None: # softmax sometimes inserts NaN if all positions are masked, replace them with 0 attn_weights = torch.masked_fill(attn_weights, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) v = v.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) attn = None if extra_attention_mask is not None: selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch) selected_v = v.new_zeros(batch_size, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros] # use `matmul` because `einsum` crashes sometimes with fp16 # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2)).transpose(1, 2) attn_probs = attn_probs.narrow( -1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch ).contiguous() if attn is None: attn = self._sliding_chunks_matmul_pv(attn_probs, v, self.one_sided_attention_window_size) else: attn += self._sliding_chunks_matmul_pv(attn_probs, v, self.one_sided_attention_window_size) assert attn.size() == (batch_size, seqlen, self.num_heads, self.head_dim), "Unexpected size" attn = attn.transpose(0, 1).reshape(seqlen, batch_size, embed_dim).contiguous() # For this case, we'll just recompute the attention for these indices # and overwrite the attn tensor. # TODO: remove the redundant computation if extra_attention_mask is not None: selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, batch_size, embed_dim) selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states[ extra_attention_mask_nonzeros[::-1] ] q = self.query_global(selected_hidden_states) k = self.key_global(hidden_states) v = self.value_global(hidden_states) q /= math.sqrt(self.head_dim) q = ( q.contiguous() .view(max_num_extra_indices_per_batch, batch_size * self.num_heads, self.head_dim) .transpose(0, 1) ) # (batch_size * self.num_heads, max_num_extra_indices_per_batch, head_dim) k = ( k.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) ) # batch_size * self.num_heads, seqlen, head_dim) v = ( v.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) ) # batch_size * self.num_heads, seqlen, head_dim) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_weights.size()) == [batch_size * self.num_heads, max_num_extra_indices_per_batch, seqlen] attn_weights = attn_weights.view(batch_size, self.num_heads, max_num_extra_indices_per_batch, seqlen) attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0 if key_padding_mask is not None: attn_weights = attn_weights.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), -10000.0,) attn_weights = attn_weights.view(batch_size * self.num_heads, max_num_extra_indices_per_batch, seqlen) attn_weights_float = F.softmax( attn_weights, dim=-1, dtype=torch.float32 ) # use fp32 for numerical stability attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) selected_attn = torch.bmm(attn_probs, v) assert list(selected_attn.size()) == [ batch_size * self.num_heads, max_num_extra_indices_per_batch, self.head_dim, ] selected_attn_4d = selected_attn.view( batch_size, self.num_heads, max_num_extra_indices_per_batch, self.head_dim ) nonzero_selected_attn = selected_attn_4d[ selection_padding_mask_nonzeros[0], :, selection_padding_mask_nonzeros[1] ] attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view( len(selection_padding_mask_nonzeros[0]), -1 ) context_layer = attn.transpose(0, 1) if self.output_attentions: if extra_attention_mask is not None: # With global attention, return global attention probabilities only # batch_size x num_heads x max_num_global_attention_tokens x sequence_length # which is the attention weights from tokens with global attention to all tokens # It doesn't not return local attention # In case of variable number of global attantion in the rows of a batch, # attn_weights are padded with -10000.0 attention scores attn_weights = attn_weights.view(batch_size, self.num_heads, max_num_extra_indices_per_batch, seqlen) else: # without global attention, return local attention probabilities # batch_size x num_heads x sequence_length x window_size # which is the attention weights of every token attending to its neighbours attn_weights = attn_weights.permute(0, 2, 1, 3) outputs = (context_layer, attn_weights) if self.output_attentions else (context_layer,) return outputs LONGFORMER_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.LongformerConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ LONGFORMER_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.LonmgformerTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to decide the attention given on each token, local attention or global attenion. Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is important for task-specific finetuning because it makes the model more flexible at representing the task. For example, for classification, the token should be given global attention. For QA, all question tokens should also have global attention. Please refer to the Longformer paper https://arxiv.org/abs/2004.05150 for more details. Mask values selected in ``[0, 1]``: ``0`` for local attention (a sliding window attention), ``1`` for global attention (tokens that attend to all other tokens, and all other tokens attend to them). token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Longformer Model outputting raw hidden-states without any specific head on top.", LONGFORMER_START_DOCSTRING, ) class LongformerModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel` to provide the ability to process long sequences following the selfattention approach described in `Longformer: the Long-Document Transformer`_by Iz Beltagy, Matthew E. Peters, and Arman Cohan. Longformer selfattention combines a local (sliding window) and global attention to extend to long documents without the O(n^2) increase in memory and compute. The selfattention module `LongformerSelfAttention` implemented here supports the combination of local and global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and dilated attention are more relevant for autoregressive language modeling than finetuning on downstream tasks. Future release will add support for autoregressive attention, but the support for dilated attention requires a custom CUDA kernel to be memory and compute efficient. .. _`Longformer: the Long-Document Transformer`: https://arxiv.org/abs/2004.05150 """ config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) if isinstance(config.attention_window, int): assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value" assert config.attention_window > 0, "`config.attention_window` has to be positive" config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer else: assert len(config.attention_window) == config.num_hidden_layers, ( "`len(config.attention_window)` should equal `config.num_hidden_layers`. " f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" ) for i, layer in enumerate(self.encoder.layer): # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention` layer.attention.self = LongformerSelfAttention(config, layer_id=i) self.init_weights() def _pad_to_window_size( self, input_ids: torch.Tensor, attention_mask: torch.Tensor, token_type_ids: torch.Tensor, position_ids: torch.Tensor, inputs_embeds: torch.Tensor, attention_window: int, pad_token_id: int, ): """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}" input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape batch_size, seqlen = input_shape[:2] padding_len = (attention_window - seqlen % attention_window) % attention_window if padding_len > 0: logger.info( "Input ids are automatically padded from {} to {} to be a multiple of `config.attention_window`: {}".format( seqlen, seqlen + padding_len, attention_window ) ) if input_ids is not None: input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) if attention_mask is not None: attention_mask = F.pad( attention_mask, (0, padding_len), value=False ) # no attention on the padding tokens if token_type_ids is not None: token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 if position_ids is not None: # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) if inputs_embeds is not None: input_ids_padding = inputs_embeds.new_full( (batch_size, padding_len), self.config.pad_token_id, dtype=torch.long, ) inputs_embeds_padding = self.embeddings(input_ids_padding) inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, masked_lm_labels=None, ): r""" Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import LongformerModel, LongformerTokenizer model = LongformerModel.from_pretrained('allenai/longformer-base-4096') tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 # Attention mask values -- 0: no attention, 1: local attention, 2: global attention attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example, # classification: the token # QA: question tokens # LM: potentially on the beginning of sentences and paragraphs sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask) """ # padding attention_window = ( self.config.attention_window if isinstance(self.config.attention_window, int) else max(self.config.attention_window) ) # merge `global_attention_mask` and `attention_mask` if global_attention_mask is not None: # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) # (global_attention_mask + 1) => 1 for local attention, 2 for global attention # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention if attention_mask is not None: attention_mask = attention_mask * (global_attention_mask + 1) else: # simply use `global_attention_mask` as `attention_mask` # if no `attention_mask` is given attention_mask = global_attention_mask + 1 padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self._pad_to_window_size( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, attention_window=attention_window, pad_token_id=self.config.pad_token_id, ) # embed output = super().forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=None, inputs_embeds=inputs_embeds, encoder_hidden_states=None, encoder_attention_mask=None, ) # undo padding if padding_len > 0: # `output` has the following tensors: sequence_output, pooled_output, (hidden_states), (attentions) # `sequence_output`: unpad because the calling function is expecting a length == input_ids.size(1) # `pooled_output`: independent of the sequence length # `hidden_states`: mainly used for debugging and analysis, so keep the padding # `attentions`: mainly used for debugging and analysis, so keep the padding output = output[0][:, :-padding_len], *output[1:] return output @add_start_docstrings("""Longformer Model with a `language modeling` head on top. """, LONGFORMER_START_DOCSTRING) class LongformerForMaskedLM(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.longformer = LongformerModel(config) self.lm_head = RobertaLMHead(config) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import LongformerForMaskedLM, LongformerTokenizer model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM # check ``LongformerModel.forward`` for more details how to set `attention_mask` loss, prediction_scores = model(input_ids, attention_mask=attention_mask, masked_lm_labels=input_ids) """ outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForSequenceClassification(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.classifier = LongformerClassificationHead(config) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForSequenceClassification import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ if global_attention_mask is None: logger.info("Initializing global attention on CLS token...") global_attention_mask = torch.zeros_like(input_ids) # global attention on cls token global_attention_mask[:, 0] = 1 outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) class LongformerClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, hidden_states, **kwargs): hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) hidden_states = self.dropout(hidden_states) hidden_states = self.dense(hidden_states) hidden_states = torch.tanh(hidden_states) hidden_states = self.dropout(hidden_states) output = self.out_proj(hidden_states) return output @add_start_docstrings( """Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD / TriviaQA (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForQuestionAnswering import torch tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere # the forward method will automatically set global attention on question tokens attention_mask = encoding["attention_mask"] start_scores, end_scores = model(input_ids, attention_mask=attention_mask) all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1] answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token """ # set global attention on question tokens if global_attention_mask is None: logger.info("Initializing global attention on question tokens...") # put global attention on all tokens until `config.sep_token_id` is reached global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id) outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForTokenClassification import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForMultipleChoice(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.longformer = LongformerModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, attention_mask=None, global_attention_mask=None, labels=None, position_ids=None, inputs_embeds=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForMultipleChoice import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096') # context = "The dog is cute" | choice = "the dog" / "the cat" choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")] input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 # global attention is automatically put on "the dog" and "the cat" outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] # set global attention on question tokens if global_attention_mask is None: logger.info("Initializing global attention on multiple choice...") # put global attention on all tokens after `config.sep_token_id` global_attention_mask = torch.stack( [ _compute_global_attention_mask(input_ids[:, i], self.config.sep_token_id, before_sep_token=False) for i in range(num_choices) ], dim=1, ) flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_global_attention_mask = ( global_attention_mask.view(-1, global_attention_mask.size(-1)) if global_attention_mask is not None else None ) outputs = self.longformer( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, global_attention_mask=flat_global_attention_mask, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_marian.py ================================================ # coding=utf-8 # Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" from .modeling_bart import BartForConditionalGeneration MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP ] class MarianMTModel(BartForConditionalGeneration): r""" Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Model API is identical to BartForConditionalGeneration. Available models are listed at `Model List `__ Examples:: from transformers1 import MarianTokenizer, MarianMTModel from typing import List src = 'fr' # source language trg = 'en' # target language sample_text = "où est l'arrêt de bus ?" mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' model = MarianMTModel.from_pretrained(mname) tok = MarianTokenizer.from_pretrained(mname) batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference gen = model.generate(**batch) # for forward pass: model(**batch) words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" """ def prepare_logits_for_generation(self, logits, cur_len, max_length): logits[:, self.config.pad_token_id] = float("-inf") if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_ids_generation(logits, self.config.eos_token_id) return logits ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_mmbt.py ================================================ # coding=utf-8 # Copyright (c) Facebook, Inc. and its affiliates. # Copyright (c) HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch MMBT model. """ import logging import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .file_utils import add_start_docstrings from .modeling_utils import ModuleUtilsMixin logger = logging.getLogger(__name__) class ModalEmbeddings(nn.Module): """Generic Modal Embeddings which takes in an encoder, and a transformer embedding. """ def __init__(self, config, encoder, embeddings): super().__init__() self.config = config self.encoder = encoder self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size) self.position_embeddings = embeddings.position_embeddings self.token_type_embeddings = embeddings.token_type_embeddings self.word_embeddings = embeddings.word_embeddings self.LayerNorm = embeddings.LayerNorm self.dropout = nn.Dropout(p=config.hidden_dropout_prob) def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None): token_embeddings = self.proj_embeddings(self.encoder(input_modal)) seq_length = token_embeddings.size(1) if start_token is not None: start_token_embeds = self.word_embeddings(start_token) seq_length += 1 token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1) if end_token is not None: end_token_embeds = self.word_embeddings(end_token) seq_length += 1 token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1) if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device) position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) if token_type_ids is None: token_type_ids = torch.zeros( (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device ) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = token_embeddings + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings MMBT_START_DOCSTRING = r""" MMBT model was proposed in `Supervised Multimodal Bitransformers for Classifying Images and Text`_ by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and obtain state-of-the-art performance on various multimodal classification benchmark tasks. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. .. _`Supervised Multimodal Bitransformers for Classifying Images and Text`: https://github.com/facebookresearch/mmbt .. _`torch.nn.Module`: https://pytorch.org/docs/stable/nn.html#module Parameters: config (:class:`~transformers1.MMBTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. transformer (:class: `~nn.Module`): A text transformer that is used by MMBT. It should have embeddings, encoder, and pooler attributes. encoder (:class: `~nn.Module`): Encoder for the second modality. It should take in a batch of modal inputs and return k, n dimension embeddings. """ MMBT_INPUTS_DOCSTRING = r""" Inputs: **input_modal**: ``torch.FloatTensor`` of shape ``(batch_size, ***)``: The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image Encoder, the shape would be (batch_size, channels, height, width) **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's appended to the end of other modality embeddings. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. **modal_start_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for Classification tasks. **modal_end_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Segment token indices to indicate different portions of the inputs. **modal_token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``: Segment token indices to indicate different portions of the non-text modality. The embeddings from these tokens will be summed with the respective token embeddings for the non-text modality. **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of positions of each input sequence tokens in the position embeddings. **modal_position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``: Indices of positions of each input sequence tokens in the position embeddings for the non-text modality. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``: Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare MMBT Model outputting raw hidden-states without any specific head on top.", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING, ) class MMBTModel(nn.Module, ModuleUtilsMixin): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` Sequence of hidden-states at the output of the last layer of the model. **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: (`optional`, returned when ``config.output_attentions=True``) list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. transformer = BertModel.from_pretrained('bert-base-uncased') encoder = ImageEncoder(args) mmbt = MMBTModel(config, transformer, encoder) """ def __init__(self, config, transformer, encoder): super().__init__() self.config = config self.transformer = transformer self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) def forward( self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, modal_position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_txt_shape = input_ids.size() elif inputs_embeds is not None: input_txt_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device modal_embeddings = self.modal_encoder( input_modal, start_token=modal_start_tokens, end_token=modal_end_tokens, position_ids=modal_position_ids, token_type_ids=modal_token_type_ids, ) input_modal_shape = modal_embeddings.size()[:-1] if token_type_ids is None: token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) txt_embeddings = self.transformer.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) input_shape = embedding_output.size()[:-1] if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) else: attention_mask = torch.cat( [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 ) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(input_shape, device=device) else: encoder_attention_mask = torch.cat( [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 ) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) encoder_outputs = self.transformer.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.transformer.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value @add_start_docstrings( """MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING, ) class MMBTForClassification(nn.Module): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: (`optional`, returned when ``config.output_attentions=True``) list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. transformer = BertModel.from_pretrained('bert-base-uncased') encoder = ImageEncoder(args) model = MMBTForClassification(config, transformer, encoder) outputs = model(input_modal, input_ids, labels=labels) loss, logits = outputs[:2] """ def __init__(self, config, transformer, encoder): super().__init__() self.num_labels = config.num_labels self.mmbt = MMBTModel(config, transformer, encoder) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) def forward( self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): outputs = self.mmbt( input_modal=input_modal, input_ids=input_ids, modal_start_tokens=modal_start_tokens, modal_end_tokens=modal_end_tokens, attention_mask=attention_mask, token_type_ids=token_type_ids, modal_token_type_ids=modal_token_type_ids, position_ids=position_ids, modal_position_ids=modal_position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch OpenAI GPT model.""" import json import logging import math import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import gelu_new, swish from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "openai-gpt", # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt ] def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) """ import re import numpy as np if ".ckpt" in openai_checkpoint_folder_path: openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle: names = json.load(names_handle) with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle: shapes = json.load(shapes_handle) offsets = np.cumsum([np.prod(shape) for shape in shapes]) init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)] init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] # This was used when we had a single embedding matrix for positions and tokens # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) # model init_params[1] init_params = [arr.squeeze() for arr in init_params] try: assert model.tokens_embed.weight.shape == init_params[1].shape assert model.positions_embed.weight.shape == init_params[0].shape except AssertionError as e: e.args += (model.tokens_embed.weight.shape, init_params[1].shape) e.args += (model.positions_embed.weight.shape, init_params[0].shape) raise model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) model.positions_embed.weight.data = torch.from_numpy(init_params[0]) names.pop(0) # Pop position and token embedding arrays init_params.pop(0) init_params.pop(0) for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): name = name[6:] # skip "model/" assert name[-2:] == ":0" name = name[:-2] name = name.split("/") pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): scope_names = re.split(r"(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "g": pointer = getattr(pointer, "weight") elif scope_names[0] == "b": pointer = getattr(pointer, "bias") elif scope_names[0] == "w": pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new} class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super().__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) self.n_head = config.n_head self.split_size = n_state self.scale = scale self.output_attentions = config.output_attentions self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights # XD: self.b may be larger than w, so we need to crop it b = self.bias[:, :, : w.size(-2), : w.size(-1)] w = w * b + -1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [torch.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = x.permute(0, 2, 1, 3).contiguous() new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states def split_heads(self, x, k=False): new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states if k: return x.permute(0, 2, 3, 1) else: return x.permute(0, 2, 1, 3) def forward(self, x, attention_mask=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = ACT_FNS[config.afn] self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) return self.dropout(h2) class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super().__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x, attention_mask=None, head_mask=None): attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask) a = attn_outputs[0] n = self.ln_1(x + a) m = self.mlp(n) h = self.ln_2(n + m) outputs = [h] + attn_outputs[1:] return outputs class OpenAIGPTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = OpenAIGPTConfig load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) OPENAI_GPT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.OpenAIGPTTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.init_weights() def get_input_embeddings(self): return self.tokens_embed def set_input_embeddings(self, new_embeddings): self.tokens_embed = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTModel.from_pretrained('openai-gpt') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if position_ids is None: # Code is different from when we had a single embedding matrice from position and token embeddings device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(input_shape[-1], dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layer) if inputs_embeds is None: inputs_embeds = self.tokens_embed(input_ids) position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) token_type_embeds = self.tokens_embed(token_type_ids) else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) all_attentions = () all_hidden_states = () for i, block in enumerate(self.h): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = block(hidden_states, attention_mask, head_mask[i]) hidden_states = outputs[0] if self.output_attentions: all_attentions = all_attentions + (outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = (hidden_states.view(*output_shape),) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last hidden state, (all hidden states), (all attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, (all hidden states), (all attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) config.num_labels = 1 self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, lm_labels=None, mc_labels=None, ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): Language modeling loss. mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) model.resize_token_embeddings(len(tokenizer)) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch REFORMER model. """ import logging import sys from collections import namedtuple from functools import reduce from operator import mul import numpy as np import torch from torch import nn from torch.autograd.function import Function from torch.nn import CrossEntropyLoss from .activations import gelu, gelu_fast, gelu_new, swish from .configuration_reformer import ReformerConfig from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, apply_chunking_to_forward logger = logging.getLogger(__name__) REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/reformer-crime-and-punishment", "google/reformer-enwik8", # See all Reformer models at https://huggingface.co/models?filter=reformer ] def mish(x): return x * torch.tanh(nn.functional.softplus(x)) ACT2FN = { "gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "gelu_fast": gelu_fast, "mish": mish, } # Define named tuples for nn.Modules here LSHSelfAttentionOutput = namedtuple("LSHSelfAttentionOutput", ["hidden_states", "attention_probs", "buckets"]) LocalSelfAttentionOutput = namedtuple("LocalSelfAttentionOutput", ["hidden_states", "attention_probs"]) AttentionOutput = namedtuple("AttentionOutput", ["hidden_states", "attention_probs", "buckets"]) ReformerOutput = namedtuple("ReformerOutput", ["hidden_states", "attn_output", "attention_probs", "buckets"]) ReformerBackwardOutput = namedtuple( "ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"] ) ReformerEncoderOutput = namedtuple("ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions"]) def _get_least_common_mult_chunk_len(config): attn_types = config.attn_layers attn_types_set = set(attn_types) if len(attn_types_set) == 1 and attn_types[0] == "lsh": return config.lsh_attn_chunk_length elif len(attn_types_set) == 1 and attn_types[0] == "local": return config.local_attn_chunk_length elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]): return np.lcm(config.lsh_attn_chunk_length, config.local_attn_chunk_length) else: raise NotImplementedError( "Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format( config.attn_layers ) ) class AxialPositionEmbeddings(nn.Module): """Constructs axial position embeddings. Useful for very long input sequences to save memory and time. """ def __init__(self, config): super().__init__() self.axial_pos_shape = config.axial_pos_shape self.axial_pos_embds_dim = config.axial_pos_embds_dim self.dropout = config.hidden_dropout_prob self.least_common_mult_chunk_length = _get_least_common_mult_chunk_len(config) self.weights = nn.ParameterList() assert ( sum(self.axial_pos_embds_dim) == config.hidden_size ), "Make sure that config.axial_pos_embds factors: {} sum to config.hidden_size: {}".format( self.axial_pos_embds_dim, config.hidden_size ) # create weights for axis, axial_pos_embd_dim in enumerate(self.axial_pos_embds_dim): # create expanded shapes ax_shape = [1] * len(self.axial_pos_shape) ax_shape[axis] = self.axial_pos_shape[axis] ax_shape = tuple(ax_shape) + (axial_pos_embd_dim,) # create tensor and init self.weights.append(nn.Parameter(torch.ones(ax_shape, dtype=torch.float32))) def forward(self, position_ids): # broadcast weights to correct shape batch_size = position_ids.shape[0] sequence_length = position_ids.shape[1] broadcasted_weights = [ weight.expand((batch_size,) + self.axial_pos_shape + weight.shape[-1:]) for weight in self.weights ] if self.training is True: assert ( reduce(mul, self.axial_pos_shape) == sequence_length ), "If training, make sure that config.axial_pos_shape factors: {} multiply to sequence length. Got prod({}) != sequence_length: {}. You might want to consider padding your sequence length to {} or changing config.axial_pos_shape.".format( self.axial_pos_shape, self.axial_pos_shape, sequence_length, reduce(mul, self.axial_pos_shape) ) if self.dropout > 0: weights = torch.cat(broadcasted_weights, dim=-1) # permute weights so that 2D correctly drops dims 1 and 2 transposed_weights = weights.transpose(2, 1) # drop entire matrix of last two dims (prev dims 1 and 2) dropped_transposed_weights = nn.functional.dropout2d( transposed_weights, p=self.dropout, training=self.training ) dropped_weights = dropped_transposed_weights.transpose(2, 1) position_encodings = torch.reshape(dropped_weights, (batch_size, sequence_length, -1)) else: position_encodings = torch.cat( [torch.reshape(weight, (batch_size, sequence_length, -1)) for weight in broadcasted_weights], dim=-1, ) else: assert ( reduce(mul, self.axial_pos_shape) >= sequence_length ), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format( self.axial_pos_shape, sequence_length, self.least_common_mult_chunk_length, ) # reshape axial encodings and use only until sequence_length position_encodings = torch.cat(broadcasted_weights, dim=-1) position_encodings = position_encodings.view(batch_size, -1, position_encodings.shape[-1])[ :, :sequence_length ] return position_encodings class PositionEmbeddings(nn.Module): """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`. """ def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) def forward(self, position_ids): position_embeddings = self.embedding(position_ids) position_embeddings = nn.functional.dropout(position_embeddings, p=self.dropout, training=self.training) return position_embeddings class ReformerEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.max_position_embeddings = config.max_position_embeddings self.dropout = config.hidden_dropout_prob self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) self.position_embeddings = ( AxialPositionEmbeddings(config) if config.axial_pos_embds else PositionEmbeddings(config) ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() device = input_ids.device else: input_shape = inputs_embeds.size()[:-1] device = inputs_embeds.device seq_length = input_shape[1] if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) assert ( position_ids.shape[-1] <= self.max_position_embeddings ), "Sequence Length: {} has to be larger equal than config.max_position_embeddings: {}".format( position_ids.shape[-1], self.max_position_embeddings ) # dropout embeddings = nn.functional.dropout(inputs_embeds, p=self.dropout, training=self.training) # add positional embeddings position_embeddings = self.position_embeddings(position_ids) embeddings = embeddings + position_embeddings return embeddings class EfficientAttentionMixin: """ A few utilities for nn.Modules in Reformer, to be used as a mixin. """ def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after): """ Used to implement attention between consecutive chunks. Args: vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...] num_chunks_before: chunks before current chunk to include in attention num_chunks_after: chunks after current chunk to include in attention Returns: tensor of shape [num_chunks, N * chunk_length, ...], where N = (1 + num_chunks_before + num_chunks_after). """ if num_chunks_before == 0 and num_chunks_after == 0: return vectors slices = [] for i in range(-num_chunks_before, num_chunks_after + 1): if i == 0: slices.append(vectors) else: slices.append(torch.cat([vectors[:, :, i:, ...], vectors[:, :, :i, ...]], dim=2)) return torch.cat(slices, dim=3) def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size): """ splits hidden_size dim into attn_head_size and num_attn_heads """ new_x_shape = x.size()[:-1] + (num_attn_heads, attn_head_size) x = x.view(*new_x_shape) return x.transpose(2, 1) def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size): """ merges attn_head_size dim and num_attn_heads dim into hidden_size """ x = x.permute(0, 2, 1, 3) return torch.reshape(x, (x.size()[0], -1, num_attn_heads * attn_head_size)) def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None): """ splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims """ batch_size = vectors.shape[0] split_dim_shape = (batch_size, num_attn_heads, dim_factor_1, dim_factor_2) if len(vectors.shape) == 4: return torch.reshape(vectors, split_dim_shape + (attn_head_size,)) elif len(vectors.shape) == 3: return torch.reshape(vectors, split_dim_shape) else: raise ValueError("Input vector rank should be one of [3, 4], but is: {}".format(len(vectors.shape))) class LSHSelfAttention(nn.Module, EfficientAttentionMixin): def __init__(self, config): super().__init__() self.config = config self.chunk_length = config.lsh_attn_chunk_length self.num_hashes = config.num_hashes self.num_buckets = config.num_buckets self.num_chunks_before = config.lsh_num_chunks_before self.num_chunks_after = config.lsh_num_chunks_after self.hash_seed = config.hash_seed self.is_decoder = config.is_decoder self.max_position_embeddings = config.max_position_embeddings self.dropout = config.lsh_attention_probs_dropout_prob self.num_attention_heads = config.num_attention_heads self.attention_head_size = config.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.hidden_size = config.hidden_size # projection matrices self.query_key = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) # save mask value here. Need fp32 and fp16 mask values self.register_buffer("self_mask_value_float16", torch.tensor(-1e3)) self.register_buffer("self_mask_value_float32", torch.tensor(-1e5)) self.register_buffer("mask_value_float16", torch.tensor(-1e4)) self.register_buffer("mask_value_float32", torch.tensor(-1e9)) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, buckets=None, **kwargs ): sequence_length = hidden_states.shape[1] batch_size = hidden_states.shape[0] # num hashes can optionally be overwritten by user num_hashes = num_hashes if num_hashes is not None else self.num_hashes # project hidden_states to query_key and value query_key_vectors = self.query_key(hidden_states) value_vectors = self.value(hidden_states) # free memory del hidden_states query_key_vectors = self._split_hidden_size_dim( query_key_vectors, self.num_attention_heads, self.attention_head_size ) value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size) assert ( query_key_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( query_key_vectors.shape[-1], self.attention_head_size ) assert ( value_vectors.shape[-1] == self.attention_head_size ), "last dim of value_vectors is {} but should be {}.".format( value_vectors.shape[-1], self.attention_head_size ) # set `num_buckets` on the fly, recommended way to do it if self.num_buckets is None: self._set_num_buckets(sequence_length) # use cached buckets for backprop only if buckets is None: # hash query key vectors into buckets buckets = self._hash_vectors(query_key_vectors, num_hashes) assert ( int(buckets.shape[-1]) == num_hashes * sequence_length ), "last dim of buckets is {}, but should be {}".format(buckets.shape[-1], num_hashes * sequence_length) sorted_bucket_idx, undo_sorted_bucket_idx = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx( sequence_length, buckets, num_hashes ) # make sure bucket idx is not longer then sequence length sorted_bucket_idx = sorted_bucket_idx % sequence_length # cluster query key value vectors according to hashed buckets query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx, num_hashes) value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx, num_hashes) query_key_vectors = self._split_seq_length_dim_to( query_key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) value_vectors = self._split_seq_length_dim_to( value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) if self.chunk_length is None: assert ( self.num_chunks_before == 0 and self.num_chunks_after == 0 ), "If `config.chunk_length` is `None`, make sure `config.num_chunks_after` and `config.num_chunks_before` are set to 0." # scale key vectors key_vectors = self._len_and_dim_norm(query_key_vectors) # get attention probs out_vectors, logits, attention_probs = self._attend( query_vectors=query_key_vectors, key_vectors=key_vectors, value_vectors=value_vectors, sorted_bucket_idx=sorted_bucket_idx, attention_mask=attention_mask, head_mask=head_mask, ) # free memory del query_key_vectors, key_vectors, value_vectors # sort clusters back to correct ordering out_vectors, logits = ReverseSort.apply( out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx, self.num_hashes ) # sum up all hash rounds if num_hashes > 1: out_vectors = self._split_seq_length_dim_to( out_vectors, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size, ) logits = self._split_seq_length_dim_to( logits, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size, ).unsqueeze(-1) probs_vectors = torch.exp(logits - torch.logsumexp(logits, dim=2, keepdim=True)) out_vectors = torch.sum(out_vectors * probs_vectors, dim=2) # free memory del probs_vectors # free memory del logits assert out_vectors.shape == ( batch_size, self.num_attention_heads, sequence_length, self.attention_head_size, ), "out_vectors have be of shape `[batch_size, config.num_attention_heads, sequence_length, config.attention_head_size]`." out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) if do_output_attentions is False: attention_probs = () return LSHSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs, buckets=buckets) def _hash_vectors(self, vectors, num_hashes): batch_size = vectors.shape[0] # See https://arxiv.org/pdf/1509.02897.pdf # We sample a different random rotation for each round of hashing to # decrease the probability of hash misses. if isinstance(self.num_buckets, int): assert ( self.num_buckets % 2 == 0 ), "There should be an even number of bucktes, but `self.num_bucktes`: {}".format(self.num_buckets) rotation_size = self.num_buckets num_buckets = self.num_buckets else: # Factorize the hash if self.num_buckets is a list or tuple rotation_size, num_buckets = 0, 1 for bucket_factor in self.num_buckets: assert bucket_factor % 2 == 0, "The number of buckets should be even, but `num_bucket`: {}".format( bucket_factor ) rotation_size = rotation_size + bucket_factor num_buckets = num_buckets * bucket_factor # remove gradient vectors = vectors.detach() if self.hash_seed is not None: # for determinism torch.manual_seed(self.hash_seed) rotations_shape = (self.num_attention_heads, vectors.shape[-1], num_hashes, rotation_size // 2) # create a random self.attention_head_size x num_hashes x num_buckets/2 random_rotations = torch.randn(rotations_shape, device=vectors.device, dtype=vectors.dtype) # Output dim: Batch_Size x Num_Attn_Heads x Num_Hashes x Seq_Len x Num_Buckets/2 rotated_vectors = torch.einsum("bmtd,mdhr->bmhtr", vectors, random_rotations) if isinstance(self.num_buckets, int) or len(self.num_buckets) == 1: rotated_vectors = torch.cat([rotated_vectors, -rotated_vectors], dim=-1) buckets = torch.argmax(rotated_vectors, dim=-1) else: # Get the buckets for them and combine. buckets, cur_sum, cur_product = None, 0, 1 for bucket_factor in self.num_buckets: rotated_vectors_factor = rotated_vectors[..., cur_sum : cur_sum + (bucket_factor // 2)] cur_sum = cur_sum + bucket_factor // 2 rotated_vectors_factor = torch.cat([rotated_vectors_factor, -rotated_vectors_factor], dim=-1) if buckets is None: buckets = torch.argmax(rotated_vectors_factor, dim=-1) else: buckets = buckets + (cur_product * torch.argmax(rotated_vectors_factor, dim=-1)) cur_product = cur_product * bucket_factor # buckets is now (Batch_size x Num_Attn_Heads x Num_Hashes x Seq_Len). # Next we add offsets so that bucket numbers from different hashing rounds don't overlap. offsets = torch.arange(num_hashes, device=vectors.device) offsets = (offsets * num_buckets).view((1, 1, -1, 1)) # expand to batch size and num attention heads offsets = offsets.expand((batch_size, self.num_attention_heads) + offsets.shape[-2:]) offset_buckets = (buckets + offsets).flatten(start_dim=2, end_dim=3) return offset_buckets def _get_sorted_bucket_idx_and_undo_sorted_bucket_idx(self, sequence_length, buckets, num_hashes): # no gradients are needed with torch.no_grad(): batch_size = buckets.shape[0] # arange and expand orig_indices = torch.arange(num_hashes * sequence_length, device=buckets.device).view(1, 1, -1) orig_indices = orig_indices.expand(batch_size, self.num_attention_heads, orig_indices.shape[-1]) # scale buckets scaled_buckets = sequence_length * buckets + (orig_indices % sequence_length) # remove gradient scaled_buckets = scaled_buckets.detach() # Hash-based sort sorted_bucket_idx = torch.argsort(scaled_buckets, dim=-1) # create simple indices to scatter to, to have undo sort indices = ( torch.arange(sorted_bucket_idx.shape[-1], device=buckets.device) .view(1, 1, -1) .expand(sorted_bucket_idx.shape) ) # get undo sort undo_sorted_bucket_idx = sorted_bucket_idx.new(*sorted_bucket_idx.size()) undo_sorted_bucket_idx.scatter_(-1, sorted_bucket_idx, indices) return sorted_bucket_idx, undo_sorted_bucket_idx def _set_num_buckets(self, sequence_length): # `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1 # make sure buckets are power of 2 num_buckets = 2 ** num_buckets_pow_2 # factorize `num_buckets` if `num_buckets` becomes too large num_buckets_limit = 2 * max( int((self.max_position_embeddings // self.chunk_length) ** (0.5)), self.chunk_length, ) if num_buckets > num_buckets_limit: num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)] logger.warning("config.num_buckets is not set. Setting config.num_buckets to {}...".format(num_buckets)) # set num buckets in config to be properly saved self.config.num_buckets = num_buckets self.num_buckets = num_buckets def _attend( self, query_vectors, key_vectors, value_vectors, sorted_bucket_idx, attention_mask, head_mask, ): key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) # get logits and dots query_key_dots = torch.matmul(query_vectors, key_vectors.transpose(-1, -2)) # free memory del query_vectors, key_vectors query_bucket_idx = self._split_seq_length_dim_to( sorted_bucket_idx, -1, self.chunk_length, self.num_attention_heads ) key_value_bucket_idx = self._look_adjacent(query_bucket_idx, self.num_chunks_before, self.num_chunks_after) # get correct mask values depending on precision if query_key_dots.dtype == torch.float16: self_mask_value = self.self_mask_value_float16.half() mask_value = self.mask_value_float16.half() else: self_mask_value = self.self_mask_value_float32 mask_value = self.mask_value_float32 mask = self._compute_attn_mask(query_bucket_idx, key_value_bucket_idx, attention_mask) if mask is not None: query_key_dots = torch.where(mask, query_key_dots, mask_value) # free memory del mask # Self mask is ALWAYS applied. # From the reformer paper (https://arxiv.org/pdf/2001.04451.pdf): # " While attention to the future is not allowed, typical implementations of the # Transformer do allow a position to attend to itself. # Such behavior is undesirable in a shared-QK formulation because the dot-product # of a query vector with itself will almost always be greater than the dot product of a # query vector with a vector at another position. We therefore modify the masking # to forbid a token from attending to itself, except in situations # where a token has no other valid attention targets (e.g. the first token in a sequence) " self_mask = torch.ne(query_bucket_idx.unsqueeze(-1), key_value_bucket_idx.unsqueeze(-2)).to( query_bucket_idx.device ) # apply self_mask query_key_dots = torch.where(self_mask, query_key_dots, self_mask_value) # free memory del self_mask logits = torch.logsumexp(query_key_dots, dim=-1, keepdim=True) # dots shape is `[batch_size, num_attn_heads, num_hashes * seq_len // chunk_length, chunk_length, chunk_length * (1 + num_chunks_before + num_chunks_after)]` attention_probs = torch.exp(query_key_dots - logits) # free memory del query_key_dots # dropout attention_probs = nn.functional.dropout(attention_probs, p=self.dropout, training=self.training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask # attend values out_vectors = torch.matmul(attention_probs, value_vectors) # free memory del value_vectors # merge chunk length logits = logits.flatten(start_dim=2, end_dim=3).squeeze(-1) out_vectors = out_vectors.flatten(start_dim=2, end_dim=3) return out_vectors, logits, attention_probs def _compute_attn_mask(self, query_indices, key_indices, attention_mask): mask = None # Causal mask if self.is_decoder: mask = torch.ge(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).to(query_indices.device) # Attention mask: chunk, look up correct mask value from key_value_bucket_idx # IMPORTANT: official trax code does not use a mask for LSH Atttention. Not sure why. if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, None, :] # expand attn_mask to fit with key_value_bucket_idx shape attention_mask = attention_mask.expand(query_indices.shape[:-1] + (-1,)) key_attn_mask = torch.gather(attention_mask, -1, key_indices) query_attn_mask = torch.gather(attention_mask, -1, query_indices) # expand to query_key_dots shape: duplicate along query axis since key sorting is the same for each query position in chunk attn_mask = query_attn_mask.unsqueeze(-1) * key_attn_mask.unsqueeze(-2) # free memory del query_attn_mask, key_attn_mask, attention_mask # multiply by casaul mask if necessary if mask is not None: mask = mask * attn_mask else: mask = attn_mask return mask def _len_and_dim_norm(self, vectors): """ length and attention head size dim normalization """ vectors = self._len_norm(vectors) vectors = vectors * torch.rsqrt( torch.tensor(self.attention_head_size, device=vectors.device, dtype=vectors.dtype) ) return vectors def _len_norm(self, x, epsilon=1e-6): """ length normalization """ variance = torch.mean(x ** 2, -1, keepdim=True) norm_x = x * torch.rsqrt(variance + epsilon) return norm_x def _gather_by_expansion(self, vectors, idxs, num_hashes): """ expand dims of idxs and vectors for all hashes and gather """ expanded_idxs = idxs.unsqueeze(-1).expand(-1, -1, -1, self.attention_head_size) vectors = vectors.repeat(1, 1, num_hashes, 1) return torch.gather(vectors, 2, expanded_idxs) class ReverseSort(Function): """ After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here. """ @staticmethod def forward(ctx, out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx, num_hashes): # save sorted_bucket_idx for backprop with torch.no_grad(): ctx.sorted_bucket_idx = sorted_bucket_idx ctx.num_hashes = num_hashes # undo sort to have correct order for next layer expanded_undo_sort_indices = undo_sorted_bucket_idx.unsqueeze(-1).expand(out_vectors.shape) out_vectors = torch.gather(out_vectors, 2, expanded_undo_sort_indices) logits = torch.gather(logits, 2, undo_sorted_bucket_idx) return out_vectors, logits @staticmethod def backward(ctx, grad_out_vectors, grad_logits): # get parameters saved in ctx sorted_bucket_idx = ctx.sorted_bucket_idx num_hashes = ctx.num_hashes # get real gradient shape # shape is BatchSize x NumAttnHeads x ChunkLen * NumHashes grad_logits_shape = grad_logits.shape # shape is BatchSize x NumAttnHeads x ChunkLen * NumHashes x ChunkLen grad_out_vectors_shape = grad_out_vectors.shape # split gradient vectors and sorted bucket idxs by concatenated chunk dimension to gather correct indices # shape is BatchSize x NumAttnHeads x NumHashes x ChunkLen grad_logits = grad_logits.view((grad_logits_shape[:2] + (num_hashes, -1))) # shape is BatchSize x NumAttnHeads x NumHashes x ChunkLen x ChunkLen grad_out_vectors = grad_out_vectors.view( (grad_out_vectors_shape[:2] + (num_hashes, -1) + grad_out_vectors_shape[-1:]) ) # reshape and expand sorted_bucket_idx = torch.reshape(sorted_bucket_idx, (sorted_bucket_idx.shape[:2] + (num_hashes, -1))) expanded_sort_indices = sorted_bucket_idx.unsqueeze(-1).expand(grad_out_vectors.shape) # reverse sort of forward grad_out_vectors = torch.gather(grad_out_vectors, 3, expanded_sort_indices) grad_logits = torch.gather(grad_logits, 3, sorted_bucket_idx) # reshape into correct shape grad_logits = torch.reshape(grad_logits, grad_logits_shape) grad_out_vectors = torch.reshape(grad_out_vectors, grad_out_vectors_shape) # return grad and `None` fillers for last 3 forward args return grad_out_vectors, grad_logits, None, None, None class LocalSelfAttention(nn.Module, EfficientAttentionMixin): def __init__(self, config): super().__init__() self.num_attention_heads = config.num_attention_heads self.chunk_length = config.local_attn_chunk_length self.num_chunks_before = config.local_num_chunks_before self.num_chunks_after = config.local_num_chunks_after self.is_decoder = config.is_decoder self.pad_token_id = config.pad_token_id self.attention_head_size = config.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.hidden_size = config.hidden_size # projection matrices self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.dropout = config.local_attention_probs_dropout_prob # save mask value here self.register_buffer("mask_value_float16", torch.tensor(-1e4)) self.register_buffer("mask_value_float32", torch.tensor(-1e9)) def forward(self, hidden_states, attention_mask=None, head_mask=None, do_output_attentions=False, **kwargs): sequence_length = hidden_states.shape[1] batch_size = hidden_states.shape[0] # project hidden_states to query, key and value query_vectors = self.query(hidden_states) key_vectors = self.key(hidden_states) value_vectors = self.value(hidden_states) # split last dim into `config.num_attention_heads` and `config.attention_head_size` query_vectors = self._split_hidden_size_dim(query_vectors, self.num_attention_heads, self.attention_head_size) key_vectors = self._split_hidden_size_dim(key_vectors, self.num_attention_heads, self.attention_head_size) value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size) assert ( query_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( query_vectors.shape[-1], self.attention_head_size ) assert ( key_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( key_vectors.shape[-1], self.attention_head_size ) assert ( value_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( value_vectors.shape[-1], self.attention_head_size ) if self.chunk_length is None: assert ( self.num_chunks_before == 0 and self.num_chunks_after == 0 ), "If `config.chunk_length` is `None`, make sure `config.num_chunks_after` and `config.num_chunks_before` are set to 0." # normalize key vectors key_vectors = key_vectors / torch.sqrt( torch.tensor(self.attention_head_size, device=key_vectors.device, dtype=key_vectors.dtype) ) # chunk vectors # B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len x attn_head_size query_vectors = self._split_seq_length_dim_to( query_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) key_vectors = self._split_seq_length_dim_to( key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) value_vectors = self._split_seq_length_dim_to( value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) # chunk indices indices = torch.arange(sequence_length, device=query_vectors.device).repeat( batch_size, self.num_attention_heads, 1 ) query_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) key_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) # append chunks before and after key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) key_indices = self._look_adjacent(key_indices, self.num_chunks_before, self.num_chunks_after) query_key_dots = torch.matmul(query_vectors, key_vectors.transpose(-1, -2)) # free memory del query_vectors, key_vectors mask = self._compute_attn_mask(query_indices, key_indices, attention_mask, query_key_dots.shape) if mask is not None: # get mask tensor depending on half precision or not if query_key_dots.dtype == torch.float16: mask_value = self.mask_value_float16.half() else: mask_value = self.mask_value_float32 query_key_dots = torch.where(mask, query_key_dots, mask_value) # free memory del mask # softmax logits = torch.logsumexp(query_key_dots, dim=-1, keepdim=True) attention_probs = torch.exp(query_key_dots - logits) # free memory del logits # dropout attention_probs = nn.functional.dropout(attention_probs, p=self.dropout, training=self.training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask # attend values out_vectors = torch.matmul(attention_probs, value_vectors) # free memory del value_vectors # merge chunk length out_vectors = out_vectors.flatten(start_dim=2, end_dim=3) assert out_vectors.shape == (batch_size, self.num_attention_heads, sequence_length, self.attention_head_size,) out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) if do_output_attentions is False: attention_probs = () return LocalSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs) def _compute_attn_mask(self, query_indices, key_indices, attention_mask, query_key_dots_shape): mask = None # chunk attention mask and look before and after if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, :] attention_mask = self._split_seq_length_dim_to(attention_mask, -1, self.chunk_length, 1) attention_mask_key = self._look_adjacent(attention_mask, self.num_chunks_before, self.num_chunks_after) # Causal mask if self.is_decoder is True: mask = torch.ge(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).to(query_indices.device) # Attention mask if attention_mask is not None: # create attn_mask attn_mask = (attention_mask.unsqueeze(-1) * attention_mask_key.unsqueeze(-2)).expand(query_key_dots_shape) # multiply by casaul mask if necessary if mask is not None: mask = mask * attn_mask else: mask = attn_mask return mask class ReformerSelfOutput(nn.Module): def __init__(self, config): super().__init__() all_head_size = config.num_attention_heads * config.attention_head_size self.dropout = config.hidden_dropout_prob self.dense = nn.Linear(all_head_size, config.hidden_size, bias=False) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return hidden_states class ReformerAttention(nn.Module): def __init__(self, config, layer_id=0): super().__init__() self.layer_id = layer_id self.attn_layers = config.attn_layers self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "lsh": self.self_attention = LSHSelfAttention(config) elif len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "local": self.self_attention = LocalSelfAttention(config) elif len(set(self.attn_layers)) == 2 and set(self.attn_layers) == set(["lsh", "local"]): # get correct attn layers if self.attn_layers[self.layer_id] == "lsh": self.self_attention = LSHSelfAttention(config) else: self.self_attention = LocalSelfAttention(config) else: raise NotImplementedError( "Only attn layer types 'lsh' and 'local' exist, but got `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format( self.attn_layers ) ) self.output = ReformerSelfOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, buckets=None, ): hidden_states = self.layer_norm(hidden_states) # use cached buckets for backprob if buckets not None for LSHSelfAttention self_attention_outputs = self.self_attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, buckets=buckets, ) attention_output = self.output(self_attention_outputs.hidden_states) # add buckets if necessary if hasattr(self_attention_outputs, "buckets"): buckets = self_attention_outputs.buckets else: buckets = None return AttentionOutput( hidden_states=attention_output, attention_probs=self_attention_outputs.attention_probs, buckets=buckets, ) class ReformerFeedForwardDense(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob if isinstance(config.hidden_act, str): self.act_fn = ACT2FN[config.hidden_act] else: self.act_fn = config.hidden_act self.dense = nn.Linear(config.hidden_size, config.feed_forward_size) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.act_fn(hidden_states) return hidden_states class ReformerFeedForwardOutput(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.dense = nn.Linear(config.feed_forward_size, config.hidden_size) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return hidden_states class ChunkReformerFeedForward(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dense = ReformerFeedForwardDense(config) self.output = ReformerFeedForwardOutput(config) def forward(self, attention_output): return apply_chunking_to_forward( self.chunk_size_feed_forward, self.seq_len_dim, self.forward_chunk, attention_output, ) def forward_chunk(self, hidden_states): hidden_states = self.layer_norm(hidden_states) hidden_states = self.dense(hidden_states) return self.output(hidden_states) class ReformerLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() self.attention = ReformerAttention(config, layer_id) # dropout requires to have the same # seed for forward and backward pass self.attention_seed = None self.feed_forward_seed = None self.feed_forward = ChunkReformerFeedForward(config) def _init_attention_seed(self): """ This function sets a new seed for the attention layer to make dropout deterministic for both forward calls: 1 normal forward call and 1 forward call in backward to recalculate activations. """ # randomize seeds if next(self.parameters()).device.type == "cuda": # GPU device_idx = torch.cuda.current_device() self.attention_seed = torch.cuda.default_generators[device_idx].seed() torch.cuda.manual_seed(self.attention_seed) else: # CPU self.attention_seed = int(torch.seed() % sys.maxsize) torch.manual_seed(self.attention_seed) def _init_feed_forward_seed(self): """ This function sets a new seed for the feed forward layer to make dropout deterministic for both forward calls: 1 normal forward call and 1 forward call in backward to recalculate activations. """ # randomize seeds if next(self.parameters()).device.type == "cuda": # GPU device_idx = torch.cuda.current_device() self.feed_forward_seed = torch.cuda.default_generators[device_idx].seed() torch.cuda.manual_seed(self.feed_forward_seed) else: # CPU self.feed_forward_seed = int(torch.seed() % sys.maxsize) torch.manual_seed(self.feed_forward_seed) def forward( self, prev_attn_output, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, ): with torch.no_grad(): # every forward pass we sample a different seed # for dropout and save for forward fn in backward pass # to have correct dropout self._init_attention_seed() attn_outputs = self.attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, ) attn_output = attn_outputs.hidden_states # Implementation of RevNet (see Fig. 6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) # Y_1 = X_1 + f(X_2) attn_output = prev_attn_output + attn_output # free memory del prev_attn_output # every forward pass we sample a different seed # for dropout and save seed for forward fn in backward # to have correct dropout self._init_feed_forward_seed() # Y_2 = X_2 + g(Y_1) hidden_states = hidden_states + self.feed_forward(attn_output) return ReformerOutput( attn_output=attn_output, hidden_states=hidden_states, attention_probs=attn_outputs.attention_probs, buckets=attn_outputs.buckets, ) def backward_pass( self, next_attn_output, hidden_states, grad_attn_output, grad_hidden_states, attention_mask=None, head_mask=None, buckets=None, ): # Implements the backward pass for reversible ResNets. # A good blog post on how this works can be found here: # Implementation of RevNet (see Fig. 6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) # This code is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py with torch.enable_grad(): next_attn_output.requires_grad = True # set seed to have correct dropout torch.manual_seed(self.feed_forward_seed) # g(Y_1) res_hidden_states = self.feed_forward(next_attn_output) res_hidden_states.backward(grad_hidden_states, retain_graph=True) with torch.no_grad(): # X_2 = Y_2 - g(Y_1) hidden_states = hidden_states - res_hidden_states del res_hidden_states grad_attn_output = grad_attn_output + next_attn_output.grad next_attn_output.grad = None with torch.enable_grad(): hidden_states.requires_grad = True # set seed to have correct dropout torch.manual_seed(self.attention_seed) # f(X_2) # use cached buckets for backprob if buckets not None for LSHSelfAttention output = self.attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, buckets=buckets, ).hidden_states output.backward(grad_attn_output, retain_graph=True) with torch.no_grad(): # X_1 = Y_1 - f(X_2) attn_output = next_attn_output - output del output, next_attn_output grad_hidden_states = grad_hidden_states + hidden_states.grad hidden_states.grad = None hidden_states = hidden_states.detach() return ReformerBackwardOutput( attn_output=attn_output, hidden_states=hidden_states, grad_attn_output=grad_attn_output, grad_hidden_states=grad_hidden_states, ) class _ReversibleFunction(Function): """ To prevent PyTorch from performing the usual backpropagation, a customized backward function is implemented here. This way it is made sure that no memory expensive activations are saved during the forward pass. This function is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py """ @staticmethod def forward( ctx, hidden_states, layers, attention_mask, head_mask, num_hashes, all_hidden_states, all_attentions, do_output_hidden_states, do_output_attentions, ): all_buckets = () # split duplicated tensor hidden_states, attn_output = torch.chunk(hidden_states, 2, dim=-1) for layer, layer_head_mask in zip(layers, head_mask): if do_output_hidden_states is True: all_hidden_states.append(hidden_states) layer_outputs = layer( prev_attn_output=attn_output, hidden_states=hidden_states, attention_mask=attention_mask, head_mask=layer_head_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, ) attn_output = layer_outputs.attn_output hidden_states = layer_outputs.hidden_states all_buckets = all_buckets + (layer_outputs.buckets,) if do_output_attentions: all_attentions.append(layer_outputs.attention_probs) # Add last layer if do_output_hidden_states is True: all_hidden_states.append(hidden_states) # attach params to ctx for backward ctx.save_for_backward(attn_output.detach(), hidden_states.detach()) ctx.layers = layers ctx.all_buckets = all_buckets ctx.head_mask = head_mask ctx.attention_mask = attention_mask # Concatenate 2 RevNet outputs return torch.cat([attn_output, hidden_states], dim=-1) @staticmethod def backward(ctx, grad_hidden_states): grad_attn_output, grad_hidden_states = torch.chunk(grad_hidden_states, 2, dim=-1) # retrieve params from ctx for backward attn_output, hidden_states = ctx.saved_tensors # create tuple output = ReformerBackwardOutput( attn_output=attn_output, hidden_states=hidden_states, grad_attn_output=grad_attn_output, grad_hidden_states=grad_hidden_states, ) # free memory del grad_attn_output, grad_hidden_states, attn_output, hidden_states layers = ctx.layers all_buckets = ctx.all_buckets head_mask = ctx.head_mask attention_mask = ctx.attention_mask for idx, layer in enumerate(layers[::-1]): # pop last buckets from stack buckets = all_buckets[-1] all_buckets = all_buckets[:-1] # backprop output = layer.backward_pass( next_attn_output=output.attn_output, hidden_states=output.hidden_states, grad_attn_output=output.grad_attn_output, grad_hidden_states=output.grad_hidden_states, head_mask=head_mask[len(layers) - idx - 1], attention_mask=attention_mask, buckets=buckets, ) assert all_buckets == (), "buckets have to be empty after backpropagation" grad_hidden_states = torch.cat([output.grad_attn_output, output.grad_hidden_states], dim=-1) # num of return vars has to match num of forward() args # return gradient for hidden_states arg and None for other args return grad_hidden_states, None, None, None, None, None, None, None, None class ReformerEncoder(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.layers = nn.ModuleList([ReformerLayer(config, i) for i in range(config.num_hidden_layers)]) # Reformer is using Rev Nets, thus last layer outputs are concatenated and # Layer Norm is done over 2 * hidden_size self.layer_norm = nn.LayerNorm(2 * config.hidden_size, eps=config.layer_norm_eps) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_hidden_states=False, do_output_attentions=False, ): # hidden_states and attention lists to be filled if wished all_hidden_states = [] all_attentions = [] # concat same tensor for reversible ResNet hidden_states = torch.cat([hidden_states, hidden_states], dim=-1) hidden_states = _ReversibleFunction.apply( hidden_states, self.layers, attention_mask, head_mask, num_hashes, all_hidden_states, all_attentions, do_output_hidden_states, do_output_attentions, ) # Apply layer norm to concatenated hidden states hidden_states = self.layer_norm(hidden_states) # Apply dropout hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return ReformerEncoderOutput( hidden_states=hidden_states, all_hidden_states=all_hidden_states, all_attentions=all_attentions ) class ReformerOnlyLMHead(nn.Module): def __init__(self, config): super().__init__() # Reformer is using Rev Nets, thus last layer outputs are concatenated and # Layer Norm is done over 2 * hidden_size self.seq_len_dim = 1 self.chunk_size_lm_head = config.chunk_size_lm_head self.decoder = nn.Linear(2 * config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states) def forward_chunk(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states class ReformerPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = ReformerConfig base_model_prefix = "reformer" @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) dummy_inputs = { "input_ids": input_ids, "attention_mask": input_mask, } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, AxialPositionEmbeddings): for weight in module.weights: torch.nn.init.normal_(weight, std=self.config.axial_norm_std) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() REFORMER_START_DOCSTRING = r""" Reformer was proposed in `Reformer: The Efficient Transformer`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. .. _`Reformer: The Efficient Transformer`: https://arxiv.org/abs/2001.04451 This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.ReformerConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ REFORMER_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. During training the input_ids sequence_length has to be a multiple of the relevant model's chunk lengths (lsh's, local's or both). During evaluation, the indices are automatically padded to be a multiple of the chunk length. Indices can be obtained using :class:`transformers1.ReformerTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. num_hashes (:obj:`int`, `optional`, defaults to :obj:`None`): `num_hashes` is the number of hashing rounds that should be performed during bucketing. Setting `num_hashes` overwrites the default `num_hashes` defined in `config.num_hashes`. For more information, see `num_hashes` in :class:`transformers1.ReformerConfig`. """ @add_start_docstrings( "The bare Reformer Model transformer outputting raw hidden-states" "without any specific head on top.", REFORMER_START_DOCSTRING, ) class ReformerModel(ReformerPreTrainedModel): def __init__(self, config): super().__init__(config) self.config = config assert ( self.config.num_hidden_layers > 0 ), "`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local']" self.embeddings = ReformerEmbeddings(config) self.encoder = ReformerEncoder(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, num_hashes=None, do_output_hidden_states=False, do_output_attentions=False, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ReformerModel, ReformerTokenizer import torch tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment') model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # TODO(PVP): delete when PR to change output_attentions is made do_output_attentions = self.config.output_attentions do_output_hidden_states = self.config.output_hidden_states if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() # noqa: F841 device = input_ids.device elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] # noqa: F841 device = inputs_embeds.device else: raise ValueError("You have to specify either input_ids or inputs_embeds") assert ( len(input_shape) == 2 ), "`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {}".format(input_shape) # prepare head mask head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers, is_attention_chunked=True) # original sequence length for padding orig_sequence_length = input_shape[-1] # if needs padding least_common_mult_chunk_length = _get_least_common_mult_chunk_len(self.config) must_pad_to_match_chunk_length = input_shape[-1] % least_common_mult_chunk_length != 0 if must_pad_to_match_chunk_length: padding_length = least_common_mult_chunk_length - input_shape[-1] % least_common_mult_chunk_length if self.training is True: raise ValueError( "If training, sequence Length {} has to be a multiple of least common multiple chunk_length {}. Please consider padding the input to a length of {}.".format( input_shape[-1], least_common_mult_chunk_length, input_shape[-1] + padding_length ) ) # pad input input_ids, inputs_embeds, attention_mask, position_ids, input_shape = self._pad_to_mult_of_chunk_length( input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, input_shape=input_shape, padding_length=padding_length, padded_seq_length=least_common_mult_chunk_length, device=device, ) embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds) encoder_outputs = self.encoder( hidden_states=embedding_output, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_hidden_states=do_output_hidden_states, do_output_attentions=do_output_attentions, ) sequence_output = encoder_outputs.hidden_states # if padding was applied if must_pad_to_match_chunk_length: sequence_output = sequence_output[:, :orig_sequence_length] outputs = (sequence_output,) # TODO(PVP): Replace by named tuple after namedtuples are introduced in the library. if do_output_hidden_states is True: outputs = outputs + (encoder_outputs.all_hidden_states,) if do_output_attentions is True: outputs = outputs + (encoder_outputs.all_attentions,) return outputs def _pad_to_mult_of_chunk_length( self, input_ids, inputs_embeds=None, attention_mask=None, position_ids=None, input_shape=None, padding_length=None, padded_seq_length=None, device=None, ): logger.info( "Input ids are automatically padded from {} to {} to be a multiple of `config.chunk_length`: {}".format( input_shape[-1], input_shape[-1] + padding_length, padded_seq_length ) ) padded_input_ids = torch.full( (input_shape[0], padding_length), self.config.pad_token_id, device=device, dtype=torch.long, ) # Extend `attention_mask` if attention_mask is not None: attention_mask = torch.cat( [ attention_mask, torch.zeros(input_shape[0], padding_length, device=device, dtype=attention_mask.dtype,), ], dim=-1, ) else: attention_mask = torch.cat( [ torch.ones(input_shape, device=device, dtype=torch.uint8), torch.zeros((input_shape[0], padding_length), device=device, dtype=torch.uint8), ], dim=-1, ) # Extend `input_ids` with padding to match least common multiple chunk_length if input_ids is not None: input_ids = torch.cat([input_ids, padded_input_ids], dim=-1) input_shape = input_ids.size() # Pad position ids if given if position_ids is not None: padded_position_ids = torch.arange(input_shape[-1], padded_seq_length, dtype=torch.long, device=device) padded_position_ids = position_ids.unsqueeze(0).expand(input_shape[0], padding_length) position_ids = torch.cat([position_ids, padded_position_ids], dim=-1) # Extend `inputs_embeds` with padding to match least common multiple chunk_length if inputs_embeds is not None: padded_inputs_embeds = self.embeddings(padded_input_ids, position_ids) inputs_embeds = torch.cat([inputs_embeds, padded_inputs_embeds], dim=-2) input_shape = inputs_embeds.size() return input_ids, inputs_embeds, attention_mask, position_ids, input_shape @add_start_docstrings("""Reformer Model with a `language modeling` head on top. """, REFORMER_START_DOCSTRING) class ReformerModelWithLMHead(ReformerPreTrainedModel): def __init__(self, config): super().__init__(config) self.reformer = ReformerModel(config) self.lm_head = ReformerOnlyLMHead(config) self.init_weights() def get_output_embeddings(self): return self.lm_head.decoder def tie_weights(self): # word embeddings are not tied in Reformer pass @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING) def forward( self, input_ids=None, position_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, num_hashes=None, labels=None, do_output_hidden_states=False, do_output_attentions=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ReformerModelWithLMHead, ReformerTokenizer import torch tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment') model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ reformer_outputs = self.reformer( input_ids, position_ids=position_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, num_hashes=num_hashes, do_output_hidden_states=do_output_hidden_states, do_output_attentions=do_output_attentions, ) sequence_output = reformer_outputs[0] logits = self.lm_head(sequence_output) outputs = (logits,) + reformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # TODO(PVP): Add smart caching inputs_dict = {"input_ids": input_ids} if "num_hashes" in kwargs: inputs_dict["num_hashes"] = kwargs["num_hashes"] return inputs_dict ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch RoBERTa model. """ import logging import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu from .modeling_utils import create_position_ids_from_input_ids logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "roberta-base", "roberta-large", "roberta-large-mnli", "distilroberta-base", "roberta-base-openai-detector", "roberta-large-openai-detector", # See all RoBERTa models at https://huggingface.co/models?filter=roberta ] class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config): super().__init__(config) self.padding_idx = config.pad_token_id self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device) else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) return super().forward( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds ) def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. :param torch.Tensor inputs_embeds: :return torch.Tensor: """ input_shape = inputs_embeds.size()[:-1] sequence_length = input_shape[1] position_ids = torch.arange( self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device ) return position_ids.unsqueeze(0).expand(input_shape) ROBERTA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.RobertaTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, ) class RobertaModel(BertModel): """ This class overrides :class:`~transformers1.BertModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.embeddings = RobertaEmbeddings(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class RobertaForMaskedLM(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.roberta = RobertaModel(config) self.lm_head = RobertaLMHead(config) self.init_weights() def get_output_embeddings(self): return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForMaskedLM import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) class RobertaLMHead(nn.Module): """Roberta Head for masked language modeling.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, features, **kwargs): x = self.dense(features) x = gelu(x) x = self.layer_norm(x) # project back to size of vocabulary with bias x = self.decoder(x) return x @add_start_docstrings( """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForSequenceClassification(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForSequenceClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForMultipleChoice(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.roberta = RobertaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForMultipleChoice import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMultipleChoice.from_pretrained('roberta-base') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None outputs = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, head_mask=head_mask, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForTokenClassification(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForTokenClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForTokenClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x @add_start_docstrings( """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) class RobertaForQuestionAnswering(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-large is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers1 import RobertaTokenizer, RobertaForQuestionAnswering import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForQuestionAnswering.from_pretrained('roberta-base') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_ids = tokenizer.encode(question, text) start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_t5.py ================================================ # coding=utf-8 # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch T5 model. """ import copy import logging import math import os import torch import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) #################################################### # This dict contrains shortcut names and associated url # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", # See all T5 models at https://huggingface.co/models?filter=t5 ] #################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 #################################################### def load_tf_weights_in_t5(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) tf_weights[name] = array for txt_name in names: name = txt_name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue if "_slot_" in name[-1]: logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue pointer = model array = tf_weights[txt_name] for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") # elif scope_names[0] == 'scale': # pointer = getattr(pointer, 'weight') # elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': # pointer = getattr(pointer, 'bias') # elif scope_names[0] == 'squad': # pointer = getattr(pointer, 'classifier') else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if scope_names[0] not in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") if scope_names[0] != "embedding": logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array.astype(np.float32)) tf_weights.pop(txt_name, None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model #################################################### # PyTorch Models are constructed by sub-classing # - torch.nn.Module for the layers and # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, x): # layer norm should always be calculated in float32 variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) x = x / torch.sqrt(variance + self.variance_epsilon) if self.weight.dtype == torch.float16: x = x.to(torch.float16) return self.weight * x class T5DenseReluDense(nn.Module): def __init__(self, config): super().__init__() self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): h = self.wi(hidden_states) h = F.relu(h) h = self.dropout(h) h = self.wo(h) return h class T5LayerFF(nn.Module): def __init__(self, config): super().__init__() self.DenseReluDense = T5DenseReluDense(config) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): norm_x = self.layer_norm(hidden_states) y = self.DenseReluDense(norm_x) layer_output = hidden_states + self.dropout(y) return layer_output class T5Attention(nn.Module): def __init__(self, config: T5Config, has_relative_attention_bias=False): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) if self.has_relative_attention_bias: self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_heads, self.d_kv) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q = prune_linear_layer(self.q, index) self.k = prune_linear_layer(self.k, index) self.v = prune_linear_layer(self.v, index) self.o = prune_linear_layer(self.o, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.inner_dim = self.d_kv * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on. Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ ret = 0 n = -relative_position if bidirectional: num_buckets //= 2 ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets n = torch.abs(n) else: n = torch.max(n, torch.zeros_like(n)) # now n is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 is_small = n < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance val_if_large = max_exact + ( torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.long) val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) ret += torch.where(is_small, n, val_if_large) return ret def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values def forward( self, input, mask=None, kv=None, position_bias=None, past_key_value_state=None, head_mask=None, query_length=None, use_cache=False, ): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) # past_key_value_state[0] is (bs, n_heads, q_len - 1, dim_per_head) bs, qlen, dim = input.size() if past_key_value_state is not None: assert self.is_decoder is True, "Encoder cannot cache past key value states" assert ( len(past_key_value_state) == 2 ), "past_key_value_state should have 2 past states: keys and values. Got {} past states".format( len(past_key_value_state) ) real_qlen = qlen + past_key_value_state[0].shape[2] if query_length is None else query_length else: real_qlen = qlen if kv is None: klen = real_qlen else: klen = kv.size(1) def shape(x): """ projection """ return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2) def unshape(x): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif past_key_value_state is None: k = v = kv k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if past_key_value_state is not None: if kv is None: k_, v_ = past_key_value_state k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = past_key_value_state if self.is_decoder and use_cache is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(real_qlen, klen) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] if mask is not None: position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) context = self.o(context) outputs = (context,) + present_key_value_state if self.output_attentions: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) return outputs class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention( norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class T5LayerCrossAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, query_length=None, ): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, query_length=query_length, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class T5Block(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)) self.layer.append(T5LayerFF(config)) def forward( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, ): if past_key_value_state is not None: assert self.is_decoder, "Only decoder can use `past_key_value_states`" expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4 error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format( expected_num_past_key_value_states, "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "", len(past_key_value_state), ) assert len(past_key_value_state) == expected_num_past_key_value_states, error_message self_attn_past_key_value_state = past_key_value_state[:2] cross_attn_past_key_value_state = past_key_value_state[2:] else: self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=self_attn_past_key_value_state, use_cache=use_cache, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights if self.is_decoder and encoder_hidden_states is not None: # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: query_length = present_key_value_state[0].shape[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, kv=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, head_mask=head_mask, past_key_value_state=cross_attn_past_key_value_state, query_length=query_length, use_cache=use_cache, ) hidden_states = cross_attention_outputs[0] # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) outputs = (hidden_states,) # Add attentions if we output them outputs = outputs + (present_key_value_state,) + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class T5PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = T5Config load_tf_weights = load_tf_weights_in_t5 base_model_prefix = "transformer" @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) dummy_inputs = { "decoder_input_ids": input_ids, "input_ids": input_ids, "decoder_attention_mask": input_mask, } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor * 1.0) elif isinstance(module, (T5Model, T5ForConditionalGeneration)): # Mesh TensorFlow embeddings initialization # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) elif isinstance(module, T5DenseReluDense): # Mesh TensorFlow FF initialization # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) if hasattr(module.wi, "bias") and module.wi.bias is not None: module.wi.bias.data.zero_() module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) if hasattr(module.wo, "bias") and module.wo.bias is not None: module.wo.bias.data.zero_() elif isinstance(module, T5Attention): # Mesh TensorFlow attention initialization to avoid scaling before softmax # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 d_model = self.config.d_model d_kv = self.config.d_kv n_heads = self.config.num_heads module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5)) module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5)) if module.has_relative_attention_bias: module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) def _shift_right(self, input_ids): decoder_start_token_id = self.config.decoder_start_token_id pad_token_id = self.config.pad_token_id assert ( decoder_start_token_id is not None ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information" # shift inputs to the right shifted_input_ids = input_ids.new_zeros(input_ids.shape) shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() shifted_input_ids[..., 0] = decoder_start_token_id assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." # replace possible -100 values in lm_labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100" return shifted_input_ids class T5Stack(T5PreTrainedModel): def __init__(self, config, embed_tokens=None): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) self.init_weights() def get_input_embeddings(self): return self.embed_tokens def get_output_embeddings(self): return self.embed_tokens def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, past_key_value_states=None, use_cache=False, ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: if self.is_decoder: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape if past_key_value_states is not None: assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( input_shape, (batch_size, 1) ) # required mask seq length can be calculated via length of past # key value states and seq_length = 1 for the last token mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length else: mask_seq_length = seq_length if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = torch.ones( batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long ) # initialize past_key_value_states with `None` if past does not exist if past_key_value_states is None: past_key_value_states = [None] * len(self.block) # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device) if self.is_decoder and encoder_attention_mask is not None: encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) present_key_value_states = () all_hidden_states = () all_attentions = () position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i], past_key_value_state=past_key_value_state, use_cache=use_cache, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states, present_key_value_state = layer_outputs[:2] if i == 0: # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[3 if self.output_attentions else 2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3] # append next layer key value states present_key_value_states = present_key_value_states + (present_key_value_state,) if self.output_attentions: all_attentions = all_attentions + (layer_outputs[2],) # We keep only self-attention weights for now hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) outputs = outputs + (present_key_value_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (presents,) (all hidden states), (all attentions) T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: https://arxiv.org/abs/1910.10683 .. _`torch.nn.Module`: https://pytorch.org/docs/stable/nn.html#module Parameters: config (:class:`~transformers1.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ T5_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the inputs on both the right and the left. Indices can be obtained using :class:`transformers1.T5Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. To know more on how to prepare :obj:`input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`). To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up decoding. If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all `decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`). inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_inputs_embeds` have to be input (see `decoder_past_key_value_states`). This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ @add_start_docstrings( "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", T5_START_DOCSTRING, ) class T5Model(T5PreTrainedModel): def __init__(self, config): super().__init__(config) self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) self.encoder = T5Stack(encoder_config, self.shared) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = T5Stack(decoder_config, self.shared) self.init_weights() def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_past_key_value_states=None, use_cache=True, inputs_embeds=None, decoder_inputs_embeds=None, head_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import T5Tokenizer, T5Model tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5Model.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] return decoder_outputs + encoder_outputs @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): def __init__(self, config): super().__init__(config) self.model_dim = config.d_model self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) self.encoder = T5Stack(encoder_config, self.shared) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = T5Stack(decoder_config, self.shared) self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) self.init_weights() def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) def get_output_embeddings(self): return self.lm_head def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_past_key_value_states=None, use_cache=True, lm_labels=None, inputs_embeds=None, decoder_inputs_embeds=None, head_mask=None, ): r""" lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). If `past_key_value_states` is used only the last prediction_scores of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. Examples:: from transformers1 import T5Tokenizer, T5ForConditionalGeneration tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids) loss, prediction_scores = outputs[:2] tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model.generate(input_ids) """ # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask ) hidden_states = encoder_outputs[0] if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(lm_labels) # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: assert lm_labels is None, "Decoder should not use cached key value states when training." if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) # insert decoder past at right place # to speed up decoding if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 sequence_output = sequence_output * (self.model_dim ** -0.5) lm_logits = self.lm_head(sequence_output) decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 decoder_outputs = (loss,) + decoder_outputs return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if len(past) < 2: encoder_outputs, decoder_past_key_value_states = past, None else: encoder_outputs, decoder_past_key_value_states = past[0], past[1] return { "decoder_input_ids": input_ids, "decoder_past_key_value_states": decoder_past_key_value_states, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "use_cache": use_cache, } def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if len(past) < 2: logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") return past decoder_past = past[1] past = (past[0],) reordered_decoder_past = () for layer_past_states in decoder_past: # get the correct batch idx from layer past batch dim # batch dim of `past` is at 2nd position reordered_layer_past_states = () for layer_past_state in layer_past_states: # need to set correct `past` for each of the four key / value states reordered_layer_past_states = reordered_layer_past_states + ( layer_past_state.index_select(0, beam_idx), ) assert reordered_layer_past_states[0].shape == layer_past_states[0].shape assert len(reordered_layer_past_states) == len(layer_past_states) reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return past + (reordered_decoder_past,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_albert.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 ALBERT model. """ import logging import tensorflow as tf from .configuration_albert import AlbertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "albert-base-v1", "albert-large-v1", "albert-xlarge-v1", "albert-xxlarge-v1", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", # See all ALBERT models at https://huggingface.co/models?filter=albert ] class TFAlbertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.config = config self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.embedding_size, embeddings_initializer=get_initializer(self.config.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.embedding_size, embeddings_initializer=get_initializer(self.config.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.config.vocab_size, self.config.embedding_size], initializer=get_initializer(self.config.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, embedding_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.config.embedding_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.config.vocab_size]) class TFAlbertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) self.key = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) self.value = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFAlbertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFAlbertAttention(TFBertSelfAttention): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): input_tensor, attention_mask, head_mask = inputs batch_size = shape_list(input_tensor)[0] mixed_query_layer = self.query(input_tensor) mixed_key_layer = self.key(input_tensor) mixed_value_layer = self.value(input_tensor) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) hidden_states = self_outputs[0] hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) attention_output = self.LayerNorm(hidden_states + input_tensor) # add attentions if we output them outputs = (attention_output,) + self_outputs[1:] return outputs class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attention = TFAlbertAttention(config, name="attention") self.ffn = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" ) if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.ffn_output = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" ) self.full_layer_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="full_layer_layer_norm" ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) ffn_output = self.ffn(attention_outputs[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0]) # add attentions if we output them outputs = (hidden_states,) + attention_outputs[1:] return outputs class TFAlbertLayerGroup(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = [ TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num) ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs layer_hidden_states = () layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training) hidden_states = layer_output[0] if self.output_attentions: layer_attentions = layer_attentions + (layer_output[1],) if self.output_hidden_states: layer_hidden_states = layer_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (layer_hidden_states,) if self.output_attentions: outputs = outputs + (layer_attentions,) # last-layer hidden state, (layer hidden states), (layer attentions) return outputs class TFAlbertTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embedding_hidden_mapping_in = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embedding_hidden_mapping_in", ) self.albert_layer_groups = [ TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups) ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_attentions = () if self.output_hidden_states: all_hidden_states = (hidden_states,) for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( [ hidden_states, attention_mask, head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], ], training=training, ) hidden_states = layer_group_output[0] if self.output_attentions: all_attentions = all_attentions + layer_group_output[-1] if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) # last-layer hidden state, (all hidden states), (all attentions) return outputs class TFAlbertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = AlbertConfig base_model_prefix = "albert" class TFAlbertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.decoder_bias = self.add_weight( shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) super().build(input_shape) def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.LayerNorm(hidden_states) hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias return hidden_states @keras_serializable class TFAlbertMainLayer(tf.keras.layers.Layer): config_class = AlbertConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") self.pooler = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="pooler", ) def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) # add hidden_states and attentions if they are here outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # sequence_output, pooled_output, (hidden_states), (attentions) return outputs ALBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`: https://arxiv.org/abs/1909.11942 .. _`tf.keras.Model`: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Args: config (:class:`~transformers1.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ALBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.AlbertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", ALBERT_START_DOCSTRING, ) class TFAlbertModel(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Albert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertModel tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertModel.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.albert(inputs, **kwargs) return outputs @add_start_docstrings( """Albert Model with two heads on top for pre-training: a `masked language modeling` head and a `sentence order prediction` (classification) head. """, ALBERT_START_DOCSTRING, ) class TFAlbertForPreTraining(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") def get_output_embeddings(self): return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForPreTraining tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForPreTraining.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, sop_scores = outputs[:2] """ outputs = self.albert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False)) outputs = (prediction_scores, sop_scores) + outputs[2:] return outputs class TFAlbertSOPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) def call(self, pooled_output, training: bool): dropout_pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(dropout_pooled_output) return logits @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") def get_output_embeddings(self): return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForMaskedLM tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] return outputs # prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ALBERT_START_DOCSTRING, ) class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForSequenceClassification tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.albert(inputs, **kwargs) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ALBERT_START_DOCSTRING, ) class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForQuestionAnswering tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2') input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ALBERT_START_DOCSTRING, ) class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForMultipleChoice tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2') example1 = ["This is a context", "Is it a context? Yes"] example2 = ["This is a context", "Is it a context? No"] encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128) outputs = model(encoding["input_ids"][None, :]) logits = outputs[0] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): print("isdict(1)") input_ids = inputs.get("input_ids") print(input_ids) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None: num_choices = shape_list(input_ids)[1] seq_length = shape_list(input_ids)[2] else: num_choices = shape_list(inputs_embeds)[1] seq_length = shape_list(inputs_embeds)[2] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] outputs = self.albert(flat_inputs, training=training) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # reshaped_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Model class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BertConfig, CTRLConfig, DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLNetConfig, ) from .configuration_utils import PretrainedConfig from .modeling_tf_albert import ( TFAlbertForMaskedLM, TFAlbertForMultipleChoice, TFAlbertForPreTraining, TFAlbertForQuestionAnswering, TFAlbertForSequenceClassification, TFAlbertModel, ) from .modeling_tf_bert import ( TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, TFBertForTokenClassification, TFBertModel, ) from .modeling_tf_ctrl import TFCTRLLMHeadModel, TFCTRLModel from .modeling_tf_distilbert import ( TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertModel, ) from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForQuestionAnswering, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) from .modeling_tf_t5 import TFT5ForConditionalGeneration, TFT5Model from .modeling_tf_transfo_xl import TFTransfoXLLMHeadModel, TFTransfoXLModel from .modeling_tf_xlm import ( TFXLMForQuestionAnsweringSimple, TFXLMForSequenceClassification, TFXLMModel, TFXLMWithLMHeadModel, ) from .modeling_tf_xlnet import ( TFXLNetForQuestionAnsweringSimple, TFXLNetForSequenceClassification, TFXLNetForTokenClassification, TFXLNetLMHeadModel, TFXLNetModel, ) logger = logging.getLogger(__name__) TF_MODEL_MAPPING = OrderedDict( [ (T5Config, TFT5Model), (DistilBertConfig, TFDistilBertModel), (AlbertConfig, TFAlbertModel), (RobertaConfig, TFRobertaModel), (BertConfig, TFBertModel), (OpenAIGPTConfig, TFOpenAIGPTModel), (GPT2Config, TFGPT2Model), (TransfoXLConfig, TFTransfoXLModel), (XLNetConfig, TFXLNetModel), (XLMConfig, TFXLMModel), (CTRLConfig, TFCTRLModel), ] ) TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForPreTraining), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForPreTraining), (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (GPT2Config, TFGPT2LMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel), (XLNetConfig, TFXLNetLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel), (CTRLConfig, TFCTRLLMHeadModel), ] ) TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForMaskedLM), (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (GPT2Config, TFGPT2LMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel), (XLNetConfig, TFXLNetLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel), (CTRLConfig, TFCTRLLMHeadModel), ] ) TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (RobertaConfig, TFRobertaForSequenceClassification), (BertConfig, TFBertForSequenceClassification), (XLNetConfig, TFXLNetForSequenceClassification), (XLMConfig, TFXLMForSequenceClassification), ] ) TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [(BertConfig, TFBertForMultipleChoice), (AlbertConfig, TFAlbertForMultipleChoice)] ) TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForQuestionAnswering), (AlbertConfig, TFAlbertForQuestionAnswering), (RobertaConfig, TFRobertaForQuestionAnswering), (BertConfig, TFBertForQuestionAnswering), (XLNetConfig, TFXLNetForQuestionAnsweringSimple), (XLMConfig, TFXLMForQuestionAnsweringSimple), ] ) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForTokenClassification), (RobertaConfig, TFRobertaForTokenClassification), (BertConfig, TFBertForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification), ] ) class TFAutoModel(object): r""" :class:`~transformers1.TFAutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) - `bert`: TFBertModel (Bert model) - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) - `gpt2`: TFGPT2Model (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLModel (Transformer-XL model) - `xlnet`: TFXLNetModel (XLNet model) - `xlm`: TFXLMModel (XLM model) - `ctrl`: TFCTRLModel (CTRL model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModel is designed to be instantiated " "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModel.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model) - isInstance of `bert` configuration class: TFBertModel (Bert model) - isInstance of `openai-gpt` configuration class: TFOpenAIGPTModel (OpenAI GPT model) - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TFTransfoXLModel (Transformer-XL model) - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model) - isInstance of `xlm` configuration class: TFXLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) - `bert`: TFTFBertModel (Bert model) - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) - `gpt2`: TFGPT2Model (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLModel (Transformer-XL model) - `xlnet`: TFXLNetModel (XLNet model) - `ctrl`: TFCTRLModel (CTRL model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys()) ) ) class TFAutoModelForPreTraining(object): r""" :class:`~transformers1.TFAutoModelForPreTraining` is a generic model class that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForPreTraining is designed to be instantiated " "using the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForPreTraining.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.TFDistilBertModelForMaskedLM` (DistilBERT model) - isInstance of `roberta` configuration class: :class:`~transformers1.TFRobertaModelForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.TFBertForPreTraining` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.TFGPT2ModelLMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.TFCTRLModelLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TFTransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.TFXLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.TFXLMWithLMHeadModel` (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.TFT5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.TFDistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.TFAlbertForPreTraining` (ALBERT model) - `roberta`: :class:`~transformers1.TFRobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.TFBertForPreTraining` (Bert model) - `openai-gpt`: :class:`~transformers1.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.TFGPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TFTransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.TFXLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.TFXLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.TFCTRLLMHeadModel` (Salesforce CTRL model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForPreTraining.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) class TFAutoModelWithLMHead(object): r""" :class:`~transformers1.TFAutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) - `bert`: TFBertForMaskedLM (Bert model) - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) - `xlnet`: TFXLNetLMHeadModel (XLNet model) - `xlm`: TFXLMWithLMHeadModel (XLM model) - `ctrl`: TFCTRLLMHeadModel (CTRL model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelWithLMHead is designed to be instantiated " "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelWithLMHead.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelWithLMHead.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the language modeling model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) - `bert`: TFBertForMaskedLM (Bert model) - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) - `xlnet`: TFXLNetLMHeadModel (XLNet model) - `xlm`: TFXLMWithLMHeadModel (XLM model) - `ctrl`: TFCTRLLMHeadModel (CTRL model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) class TFAutoModelForMultipleChoice: r""" :class:`~transformers1.TFAutoModelForMultipleChoice` is a generic model class that will be instantiated as one of the multiple choice model classes of the library when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `albert`: TFAlbertForMultipleChoice (Albert model) - `bert`: TFBertForMultipleChoice (Bert model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForMultipleChoice is designed to be instantiated " "using the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForMultipleChoice.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `albert` configuration class: AlbertModel (Albert model) - isInstance of `bert` configuration class: BertModel (Bert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the multiple choice model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `albert`: TFRobertaForMultiple (Albert model) - `bert`: TFBertForMultipleChoice (Bert model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelFormultipleChoice.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelFormultipleChoice.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) class TFAutoModelForSequenceClassification(object): r""" :class:`~transformers1.TFAutoModelForSequenceClassification` is a generic model class that will be instantiated as one of the sequence classification model classes of the library when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) - `xlnet`: TFXLNetForSequenceClassification (XLNet model) - `xlm`: TFXLMForSequenceClassification (XLM model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForSequenceClassification is designed to be instantiated " "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForSequenceClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the sequence classification model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) - `xlnet`: TFXLNetForSequenceClassification (XLNet model) - `xlm`: TFXLMForSequenceClassification (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) class TFAutoModelForQuestionAnswering(object): r""" :class:`~transformers1.TFAutoModelForQuestionAnswering` is a generic model class that will be instantiated as one of the question answering model classes of the library when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) - `bert`: TFBertForQuestionAnswering (Bert model) - `xlnet`: TFXLNetForQuestionAnswering (XLNet model) - `xlm`: TFXLMForQuestionAnswering (XLM model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForQuestionAnswering is designed to be instantiated " "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForQuestionAnswering.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `albert` configuration class: AlbertModel (ALBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForQuestionAnswering.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) - `bert`: TFBertForQuestionAnswering (Bert model) - `xlnet`: TFXLNetForQuestionAnswering (XLNet model) - `xlm`: TFXLMForQuestionAnswering (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) class TFAutoModelForTokenClassification: def __init__(self): raise EnvironmentError( "TFAutoModelForTokenClassification is designed to be instantiated " "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForTokenClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model) - isInstance of `roberta` configuration class: RobteraModel (Roberta model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `bert`: BertForTokenClassification (Bert model) - `xlnet`: XLNetForTokenClassification (XLNet model) - `distilbert`: DistilBertForTokenClassification (DistilBert model) - `roberta`: RobertaForTokenClassification (Roberta model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 BERT model. """ import logging import numpy as np import tensorflow as tf from .configuration_bert import BertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased", "bert-base-chinese", "bert-base-german-cased", "bert-large-uncased-whole-word-masking", "bert-large-cased-whole-word-masking", "bert-large-uncased-whole-word-masking-finetuned-squad", "bert-large-cased-whole-word-masking-finetuned-squad", "bert-base-cased-finetuned-mrpc", "cl-tohoku/bert-base-japanese", "cl-tohoku/bert-base-japanese-whole-word-masking", "cl-tohoku/bert-base-japanese-char", "cl-tohoku/bert-base-japanese-char-whole-word-masking", "TurkuNLP/bert-base-finnish-cased-v1", "TurkuNLP/bert-base-finnish-uncased-v1", "wietsedv/bert-base-dutch-cased", # See all BERT models at https://huggingface.co/models?filter=bert ] def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.sigmoid(x) ACT2FN = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), "gelu_new": tf.keras.layers.Activation(gelu_new), } class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size self.initializer_range = config.initializer_range self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.hidden_size, embeddings_initializer=get_initializer(self.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.hidden_size, embeddings_initializer=get_initializer(self.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.hidden_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFBertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) self.key = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) self.value = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = tf.matmul( query_layer, key_layer, transpose_b=True ) # (batch size, num_heads, seq_len_q, seq_len_k) dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.self_attention = TFBertSelfAttention(config, name="self") self.dense_output = TFBertSelfOutput(config, name="output") def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): input_tensor, attention_mask, head_mask = inputs self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training) attention_output = self.dense_output([self_outputs[0], input_tensor], training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attention = TFBertAttention(config, name="attention") self.intermediate = TFBertIntermediate(config, name="intermediate") self.bert_output = TFBertOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) attention_output = attention_outputs[0] intermediate_output = self.intermediate(attention_output) layer_output = self.bert_output([intermediate_output, attention_output], training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs class TFBertEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # outputs, (hidden states), (attentions) class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) return pooled_output class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.transform = TFBertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states class TFBertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") def call(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores class TFBertNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" ) def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score @keras_serializable class TFBertMainLayer(tf.keras.layers.Layer): config_class = BertConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFBertEmbeddings(config, name="embeddings") self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) class TFBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = BertConfig base_model_prefix = "bert" BERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`__ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class TFBertModel(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertModel tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertModel.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.bert(inputs, **kwargs) return outputs @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class TFBertForPreTraining(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForPreTraining tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForPreTraining.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) seq_relationship_score = self.nsp(pooled_output) outputs = (prediction_scores, seq_relationship_score,) + outputs[ 2: ] # add hidden states and attention if they are here return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class TFBertForMaskedLM(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForMaskedLM tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class TFBertForNextSentencePrediction(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForNextSentencePrediction tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf') logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] assert logits[0][0] < logits[0][1] # the next sentence was random """ outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] seq_relationship_score = self.nsp(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here return outputs # seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class TFBertForSequenceClassification(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForSequenceClassification tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class TFBertForMultipleChoice(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForMultipleChoice tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." choice0 = "It is eaten with a fork and a knife." choice1 = "It is eaten while held in the hand." encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='tf', pad_to_max_length=True) # linear classifier on the output is not yet trained outputs = model(encoding['input_ids'][None, :]) logits = outputs[0] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None: num_choices = shape_list(input_ids)[1] seq_length = shape_list(input_ids)[2] else: num_choices = shape_list(inputs_embeds)[1] seq_length = shape_list(inputs_embeds)[2] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] outputs = self.bert(flat_inputs, training=training) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class TFBertForTokenClassification(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForTokenClassification tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class TFBertForQuestionAnswering(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForQuestionAnswering tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :]) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1]) assert answer == "a nice puppet" """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_camembert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 CamemBERT model. """ import logging from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) logger = logging.getLogger(__name__) TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all CamemBERT models at https://huggingface.co/models?filter=camembert ] CAMEMBERT_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", CAMEMBERT_START_DOCSTRING, ) class TFCamembertModel(TFRobertaModel): """ This class overrides :class:`~transformers1.TFRobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForMaskedLM(TFRobertaForMaskedLM): """ This class overrides :class:`~transformers1.TFRobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): """ This class overrides :class:`~transformers1.TFRobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForTokenClassification(TFRobertaForTokenClassification): """ This class overrides :class:`~transformers1.TFRobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 CTRL model.""" import logging import numpy as np import tensorflow as tf from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ctrl" # See all CTRL models at https://huggingface.co/models?filter=ctrl ] def angle_defn(pos, i, d_model_size): angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size)) return pos * angle_rates def positional_encoding(position, d_model_size): # create the sinusoidal pattern for the positional encoding angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) sines = np.sin(angle_rads[:, 0::2]) cosines = np.cos(angle_rads[:, 1::2]) # pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...], dtype=tf.float32) pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32) return pos_encoding def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = tf.matmul(q, k, transpose_b=True) dk = tf.cast(shape_list(k)[-1], tf.float32) scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) if mask is not None: scaled_attention_logits += mask * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask output = tf.matmul(attention_weights, v) return output, attention_weights class TFMultiHeadAttention(tf.keras.layers.Layer): def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): super().__init__(**kwargs) self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size self.depth = int(d_model_size / self.num_heads) self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") self.dense = tf.keras.layers.Dense(d_model_size, name="dense") def split_into_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): v, k, q, mask, layer_past, attention_mask, head_mask, use_cache = inputs batch_size = shape_list(q)[0] q = self.Wq(q) k = self.Wk(k) v = self.Wv(v) q = self.split_into_heads(q, batch_size) k = self.split_into_heads(k, batch_size) v = self.split_into_heads(v, batch_size) if layer_past is not None: past_key, past_value = tf.unstack(layer_past, axis=0) k = tf.concat((past_key, k), axis=-2) v = tf.concat((past_value, v), axis=-2) # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if use_cache is True: present = tf.stack((k, v), axis=0) else: present = (None,) output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) attn = output[1] original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) output = self.dense(original_size_attention) outputs = (output, present) if self.output_attentions: outputs = outputs + (attn,) return outputs def point_wise_feed_forward_network(d_model_size, dff, name=""): return tf.keras.Sequential( [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")], name="ffn", ) class TFEncoderLayer(tf.keras.layers.Layer): def __init__( self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs ): super().__init__(**kwargs) self.multi_head_attention = TFMultiHeadAttention( d_model_size, num_heads, output_attentions, name="multi_head_attention" ) self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn") self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") self.dropout1 = tf.keras.layers.Dropout(rate) self.dropout2 = tf.keras.layers.Dropout(rate) def call(self, inputs, training=False): x, mask, layer_past, attention_mask, head_mask, use_cache = inputs normed = self.layernorm1(x) attn_outputs = self.multi_head_attention( [normed, normed, normed, mask, layer_past, attention_mask, head_mask, use_cache], training=training ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output, training=training) out1 = x + attn_output out2 = self.layernorm2(out1) ffn_output = self.ffn(out2) ffn_output = self.dropout2(ffn_output, training=training) out2 = out1 + ffn_output outputs = (out2,) + attn_outputs[1:] return outputs @keras_serializable class TFCTRLMainLayer(tf.keras.layers.Layer): config_class = CTRLConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.d_model_size = config.n_embd self.num_layers = config.n_layer self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) self.w = TFSharedEmbeddings( config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w" ) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [ TFEncoderLayer( config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.layer_norm_epsilon, config.output_attentions, name="h_._{}".format(i), ) for i in range(config.n_layer) ] self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") def get_input_embeddings(self): return self.w def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs # If using past key value states, only the last tokens # should be given as an input if past is not None: if input_ids is not None: input_ids = input_ids[:, -1:] if inputs_embeds is not None: inputs_embeds = inputs_embeds[:, -1:] if token_type_ids is not None: token_type_ids = token_type_ids[:, -1:] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = shape_list(past[0][0])[-2] if position_ids is None: position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.tile(position_ids, [input_shape[0], 1]) # Attention mask. if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_layers if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.w(token_type_ids, mode="embedding") token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) else: token_type_embeds = 0 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.w(input_ids, mode="embedding") seq_len = input_shape[-1] mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) pos_embeds = tf.gather(self.pos_encoding, position_ids) hidden_states = inputs_embeds + pos_embeds + token_type_embeds hidden_states = self.dropout(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] presents = () all_hidden_states = () all_attentions = [] for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = h([hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache], training=training) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.layernorm(hidden_states) hidden_states = tf.reshape(hidden_states, output_shape) if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs class TFCTRLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = CTRLConfig base_model_prefix = "transformer" CTRL_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.CTRLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ CTRL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only input_ids that do not have their past calculated should be passed as input_ids (see `past`). Indices can be obtained using :class:`transformers1.CTRLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", CTRL_START_DOCSTRING, ) class TFCTRLModel(TFCTRLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import CTRLTokenizer, TFCTRLModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLModel.from_pretrained('ctrl') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs class TFCTRLLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") def get_output_embeddings(self): return self.lm_head.input_embeddings def prepare_inputs_for_generation(self, inputs, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: inputs = tf.expand_dims(inputs[:, -1], -1) return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import CTRLTokenizer, TFCTRLLMHeadModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLLMHeadModel.from_pretrained('ctrl') input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)]) outputs = model(input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 DistilBERT model """ import logging import math import numpy as np import tensorflow as tf from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf class TFEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range self.word_embeddings = TFSharedEmbeddings( config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings" ) # padding_idx=0) self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.dim, embeddings_initializer=get_initializer(config.initializer_range), name="position_embeddings", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, inputs_embeds=None, training=False): """ Parameters ---------- input_ids: tf.Tensor(bs, max_seq_length) The token ids to embed. Outputs ------- embeddings: tf.Tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ if not isinstance(inputs, (tuple, list)): input_ids = inputs position_ids = None else: input_ids, position_ids = inputs if input_ids is not None: seq_length = shape_list(input_ids)[1] else: seq_length = shape_list(inputs_embeds)[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.dim]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFMultiHeadSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 self.q_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" ) self.k_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" ) self.v_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" ) self.out_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): """ Parameters ---------- query: tf.Tensor(bs, seq_length, dim) key: tf.Tensor(bs, seq_length, dim) value: tf.Tensor(bs, seq_length, dim) mask: tf.Tensor(bs, seq_length) Outputs ------- weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` """ query, key, value, mask, head_mask = inputs bs, q_length, dim = shape_list(query) k_length = shape_list(key)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshape = [bs, 1, 1, k_length] def shape(x): """ separate heads """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) def unshape(x): """ group heads """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) scores = scores - 1e30 * (1.0 - mask) weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) self.lin1 = tf.keras.layers.Dense( config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" ) self.lin2 = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" ) assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( config.activation ) self.activation = ( tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu ) def call(self, input, training=False): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x, training=training) return x class TFTransformerBlock(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim self.hidden_dim = config.hidden_dim self.dropout = tf.keras.layers.Dropout(config.dropout) self.activation = config.activation self.output_attentions = config.output_attentions assert config.dim % config.n_heads == 0 self.attention = TFMultiHeadSelfAttention(config, name="attention") self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") self.ffn = TFFFN(config, name="ffn") self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") def call(self, inputs, training=False): # removed: src_enc=None, src_len=None """ Parameters ---------- x: tf.Tensor(bs, seq_length, dim) attn_mask: tf.Tensor(bs, seq_length) Outputs ------- sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: tf.Tensor(bs, seq_length, dim) The output of the transformer block contextualization. """ x, attn_mask, head_mask = inputs # Self-Attention sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training) if self.output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples # assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if self.output_attentions: output = (sa_weights,) + output return output class TFTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] def call(self, inputs, training=False): """ Parameters ---------- x: tf.Tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: tf.Tensor(bs, seq_length) Attention mask on the sequence. Outputs ------- hidden_state: tf.Tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top) layer all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ x, attn_mask, head_mask = inputs all_hidden_states = () all_attentions = () hidden_state = x for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training) hidden_state = layer_outputs[-1] if self.output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) outputs = (hidden_state,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class TFDistilBertMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): raise NotImplementedError def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.ones(input_shape) # (bs, seq_length) attention_mask = tf.cast(attention_mask, dtype=tf.float32) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = DistilBertConfig base_model_prefix = "distilbert" DISTILBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class TFDistilBertModel(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertModel tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertModel.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.distilbert(inputs, **kwargs) return outputs class TFDistilBertLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.vocab_size = config.vocab_size self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.vocab_transform = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" ) self.act = tf.keras.layers.Activation(gelu) self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") def get_output_embeddings(self): return self.vocab_projector.input_embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForMaskedLM tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_states = distilbert_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) outputs = (prediction_logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.pre_classifier = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), activation="relu", name="pre_classifier", ) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForSequenceClassification tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.dropout = tf.keras.layers.Dropout(config.dropout) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForTokenClassification tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.distilbert(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) assert config.num_labels == 2 self.dropout = tf.keras.layers.Dropout(config.qa_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForQuestionAnswering tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + distilbert_output[1:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_electra.py ================================================ import logging import tensorflow as tf from transformers import ElectraConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_utils import get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/electra-small-generator", "google/electra-base-generator", "google/electra-large-generator", "google/electra-small-discriminator", "google/electra-base-discriminator", "google/electra-large-discriminator", # See all ELECTRA models at https://huggingface.co/models?filter=electra ] class TFElectraEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.embedding_size = config.embedding_size self.initializer_range = config.initializer_range self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.embedding_size, embeddings_initializer=get_initializer(self.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.embedding_size, embeddings_initializer=get_initializer(self.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.embedding_size], initializer=get_initializer(self.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.embedding_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction") self.config = config def call(self, discriminator_hidden_states, training=False): hidden_states = self.dense(discriminator_hidden_states) hidden_states = ACT2FN[self.config.hidden_act](hidden_states) logits = tf.squeeze(self.dense_prediction(hidden_states)) return logits class TFElectraGeneratorPredictions(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") def call(self, generator_hidden_states, training=False): hidden_states = self.dense(generator_hidden_states) hidden_states = ACT2FN["gelu"](hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class TFElectraPreTrainedModel(TFBertPreTrainedModel): config_class = ElectraConfig base_model_prefix = "electra" def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask is None: attention_mask = tf.fill(input_shape, 1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask): if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.config.num_hidden_layers return head_mask class TFElectraMainLayer(TFElectraPreTrainedModel): config_class = ElectraConfig def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.embeddings = TFElectraEmbeddings(config, name="embeddings") if config.embedding_size != config.hidden_size: self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") self.encoder = TFBertEncoder(config, name="encoder") self.config = config def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) head_mask = self.get_head_mask(head_mask) hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states, training=training) hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training) return hidden_states ELECTRA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.ElectraConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ELECTRA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.ElectraTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " "hidden size and embedding size are different." "" "Both the generator and discriminator checkpoints may be loaded into this model.", ELECTRA_START_DOCSTRING, ) class TFElectraModel(TFElectraPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") def get_input_embeddings(self): return self.electra.embeddings @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraModel tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraModel.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.electra(inputs, **kwargs) return outputs @add_start_docstrings( """ Electra model with a binary classification head on top as used during pre-training for identifying generated tokens. Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model of the two to have the correct classification head to be used for this model.""", ELECTRA_START_DOCSTRING, ) class TFElectraForPreTraining(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") def get_input_embeddings(self): return self.electra.embeddings @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Prediction scores of the head (scores for each token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForPreTraining tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) discriminator_sequence_output = discriminator_hidden_states[0] logits = self.discriminator_predictions(discriminator_sequence_output) output = (logits,) output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) class TFElectraMaskedLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states, training=False): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """ Electra model with a language modeling head on top. Even though both the discriminator and generator may be loaded into this model, the generator is the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) class TFElectraForMaskedLM(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.vocab_size = config.vocab_size self.electra = TFElectraMainLayer(config, name="electra") self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") def get_input_embeddings(self): return self.electra.embeddings def get_output_embeddings(self): return self.generator_lm_head @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForMaskedLM tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ generator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) generator_sequence_output = generator_hidden_states[0] prediction_scores = self.generator_predictions(generator_sequence_output, training=training) prediction_scores = self.generator_lm_head(prediction_scores, training=training) output = (prediction_scores,) output += generator_hidden_states[1:] return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a token classification head on top. Both the discriminator and generator may be loaded into this model.""", ELECTRA_START_DOCSTRING, ) class TFElectraForTokenClassification(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForTokenClassification tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) output = (logits,) output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 Flaubert model. """ import logging import random import tensorflow as tf from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings from .modeling_tf_xlm import ( TFXLMForSequenceClassification, TFXLMMainLayer, TFXLMModel, TFXLMWithLMHeadModel, get_masks, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all Flaubert models at https://huggingface.co/models?filter=flaubert ] FLAUBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.FlaubertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ FLAUBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`): dictionary with ``tf.Tensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", FLAUBERT_START_DOCSTRING, ) class TFFlaubertModel(TFXLMModel): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") class TFFlaubertMainLayer(TFXLMMainLayer): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) def call( self, inputs, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, training=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask langs = inputs[2] if len(inputs) > 2 else langs token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids lengths = inputs[5] if len(inputs) > 5 else lengths cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) langs = inputs.get("langs", langs) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) lengths = inputs.get("lengths", lengths) cache = inputs.get("cache", cache) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: bs, slen = shape_list(input_ids) elif inputs_embeds is not None: bs, slen = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if lengths is None: if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs # assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs) # assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # position_ids if position_ids is None: position_ids = tf.expand_dims(tf.range(slen), axis=0) else: # assert shape_list(position_ids) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: # assert shape_list(langs) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(langs), [bs, slen]) # langs = langs.transpose(0, 1) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids) if langs is not None and self.use_lang_emb: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = self.dropout(tensor, training=training) tensor = tensor * mask[..., tf.newaxis] # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): # LayerDrop dropout_probability = random.uniform(0, 1) if training and (dropout_probability < self.layerdrop): continue if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention if not self.pre_norm: attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) else: tensor_normalized = self.layer_norm1[i](tensor) attn_outputs = self.attentions[i]( [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training ) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN if not self.pre_norm: tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) else: tensor_normalized = self.layer_norm2[i](tensor) tensor = tensor + self.ffns[i](tensor_normalized) tensor = tensor * mask[..., tf.newaxis] # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) @add_start_docstrings( """The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, FLAUBERT_START_DOCSTRING, ) class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") @add_start_docstrings( """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, FLAUBERT_START_DOCSTRING, ) class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 OpenAI GPT-2 model. """ import logging import numpy as np import tensorflow as tf from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFConv1D, TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, keras_serializable, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2", # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 ] def gelu(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): pass @staticmethod def causal_attention_mask(nd, ns, dtype): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) def _attn(self, inputs, training=False): q, k, v, attention_mask, head_mask = inputs # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. _, _, nd, ns = shape_list(w) b = self.causal_attention_mask(nd, ns, dtype=w.dtype) b = tf.reshape(b, [1, 1, nd, ns]) w = w * b - 1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = tf.nn.softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [tf.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = tf.transpose(x, [0, 2, 1, 3]) x_shape = shape_list(x) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] return tf.reshape(x, new_x_shape) def split_heads(self, x): x_shape = shape_list(x) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] x = tf.reshape(x, new_x_shape) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask, use_cache = inputs x = self.c_attn(x) query, key, value = tf.split(x, 3, axis=2) query = self.split_heads(query) key = self.split_heads(key) value = self.split_heads(value) if layer_past is not None: past_key, past_value = tf.unstack(layer_past, axis=0) key = tf.concat([past_key, key], axis=-2) value = tf.concat([past_value, value], axis=-2) # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if use_cache is True: present = tf.stack([key, value], axis=0) else: present = (None,) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a, training=training) outputs = [a, present] + attn_outputs[1:] return outputs # a, present, (attentions) class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) def call(self, x, training=False): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) h2 = self.dropout(h2, training=training) return h2 class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") self.mlp = TFMLP(4 * nx, config, name="mlp") def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask, use_cache = inputs a = self.ln_1(x) output_attn = self.attn([a, layer_past, attention_mask, head_mask, use_cache], training=training) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a m = self.ln_2(x) m = self.mlp(m, training=training) x = x + m outputs = [x] + output_attn[1:] return outputs # x, present, (attentions) @keras_serializable class TFGPT2MainLayer(tf.keras.layers.Layer): config_class = GPT2Config def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd self.wte = TFSharedEmbeddings( config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" ) self.wpe = tf.keras.layers.Embedding( config.n_positions, config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="wpe", ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") def get_input_embeddings(self): return self.wte def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = shape_list(past[0][0])[-2] if position_ids is None: position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.wte(input_ids, mode="embedding") position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.wte(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] presents = () all_attentions = [] all_hidden_states = () for i, (block, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = block([hidden_states, layer_past, attention_mask, head_mask[i], use_cache], training=training) hidden_states, present = outputs[:2] presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.ln_f(hidden_states) hidden_states = tf.reshape(hidden_states, output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, presents, (all hidden_states), (attentions) class TFGPT2PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = GPT2Config base_model_prefix = "transformer" GPT2_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only `input_ids` that do not have their past calculated should be passed as `input_ids`. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as `input_ids` as they have already been computed. attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", GPT2_START_DOCSTRING, ) class TFGPT2Model(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2Model tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2Model.from_pretrained('gpt2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, ) class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.wte def prepare_inputs_for_generation(self, inputs, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: inputs = tf.expand_dims(inputs[:, -1], -1) return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2LMHeadModel.from_pretrained('gpt2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.transformer.wte(hidden_states, mode="linear") outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, presents, (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, GPT2_START_DOCSTRING, ) class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFGPT2MainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( config, initializer_range=config.initializer_range, name="multiple_choice_head" ) def get_output_embeddings(self): return self.transformer.wte @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, use_cache=True, training=False, ): r""" mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as `input_ids` as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids use_cache = inputs[8] if len(inputs) > 8 else use_cache assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None: input_shapes = shape_list(input_ids) else: input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, use_cache, ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) lm_logits = self.transformer.wte(hidden_states, mode="linear") mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 OpenAI GPT model.""" import logging import numpy as np import tensorflow as tf from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFConv1D, TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "openai-gpt", # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt ] def gelu(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.math.sigmoid(x) ACT_FNS = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), } class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): pass @staticmethod def causal_attention_mask(nd, ns, dtype): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) def _attn(self, inputs, training=False): q, k, v, attention_mask, head_mask = inputs # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. _, _, nd, ns = shape_list(w) b = self.causal_attention_mask(nd, ns, dtype=w.dtype) b = tf.reshape(b, [1, 1, nd, ns]) w = w * b - 1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = tf.nn.softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [tf.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = tf.transpose(x, [0, 2, 1, 3]) x_shape = shape_list(x) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] return tf.reshape(x, new_x_shape) def split_heads(self, x): x_shape = shape_list(x) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] x = tf.reshape(x, new_x_shape) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) def call(self, inputs, training=False): x, attention_mask, head_mask = inputs x = self.c_attn(x) query, key, value = tf.split(x, 3, axis=2) query = self.split_heads(query) key = self.split_heads(key) value = self.split_heads(value) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a, training=training) outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) def call(self, x, training=False): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) h2 = self.dropout(h2, training=training) return h2 class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.mlp = TFMLP(4 * nx, config, name="mlp") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") def call(self, inputs, training=False): x, attention_mask, head_mask = inputs output_attn = self.attn([x, attention_mask, head_mask], training=training) a = output_attn[0] # output_attn: a, (attentions) n = self.ln_1(x + a) m = self.mlp(n, training=training) h = self.ln_2(n + m) outputs = [h] + output_attn[1:] return outputs # x, (attentions) class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd self.tokens_embed = TFSharedEmbeddings( config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed" ) self.positions_embed = tf.keras.layers.Embedding( config.n_positions, config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="positions_embed", ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] def get_input_embeddings(self): return self.tokens_embed def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if position_ids is None: position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.tokens_embed(input_ids, mode="embedding") position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] all_attentions = [] all_hidden_states = () for i, block in enumerate(self.h): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = block([hidden_states, attention_mask, head_mask[i]], training=training) hidden_states = outputs[0] if self.output_attentions: all_attentions.append(outputs[1]) hidden_states = tf.reshape(hidden_states, output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, (all hidden_states), (attentions) class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = OpenAIGPTConfig base_model_prefix = "transformer" OPENAI_GPT_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTModel.from_pretrained('openai-gpt') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, (all hidden_states), (attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( config, initializer_range=config.initializer_range, name="multiple_choice_head" ) def get_output_embeddings(self): return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False, ): r""" mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs if input_ids is not None: input_shapes = shape_list(input_ids) else: input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] return outputs # lm logits, mc logits, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_pytorch_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch - TF 2.0 general utilities.""" import logging import os import re import numpy logger = logging.getLogger(__name__) def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): """ Convert a TF 2.0 model variable name in a pytorch model weight name. Conventions for TF2.0 scopes -> PyTorch attribute names conversions: - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) return tuple with: - pytorch model weight name - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other """ tf_name = tf_name.replace(":0", "") # device ids tf_name = re.sub( r"/[^/]*___([^/]*)/", r"/\1/", tf_name ) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) tf_name = tf_name.replace( "_._", "/" ) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators tf_name = tf_name[1:] # Remove level zero # When should we transpose the weights transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name) # Convert standard TF2.0 names in PyTorch names if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma": tf_name[-1] = "weight" if tf_name[-1] == "beta": tf_name[-1] = "bias" # Remove prefix if needed tf_name = ".".join(tf_name) if start_prefix_to_remove: tf_name = tf_name.replace(start_prefix_to_remove, "", 1) return tf_name, transpose ##################### # PyTorch => TF 2.0 # ##################### def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise pt_path = os.path.abspath(pytorch_checkpoint_path) logger.info("Loading PyTorch weights from {}".format(pt_path)) pt_state_dict = torch.load(pt_path, map_location="cpu") logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values()))) return load_pytorch_weights_in_tf2_model( tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys ) def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ pt_state_dict = pt_model.state_dict() return load_pytorch_weights_in_tf2_model( tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys ) def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): """ Load pytorch state_dict in a TF 2.0 model. """ try: import torch # noqa: F401 import tensorflow as tf # noqa: F401 from tensorflow.python.keras import backend as K except ImportError: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise if tf_inputs is None: tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure model is built # Adapt state dict - TODO remove this and update the AWS weights files instead # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] new_keys = [] for key in pt_state_dict.keys(): new_key = None if "gamma" in key: new_key = key.replace("gamma", "weight") if "beta" in key: new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): pt_state_dict[new_key] = pt_state_dict.pop(old_key) # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't start_prefix_to_remove = "" if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()): start_prefix_to_remove = tf_model.base_model_prefix + "." symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights tf_loaded_numel = 0 weight_value_tuples = [] all_pytorch_weights = set(list(pt_state_dict.keys())) for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name name, transpose = convert_tf_weight_name_to_pt_weight_name( sw_name, start_prefix_to_remove=start_prefix_to_remove ) # Find associated numpy array in pytorch model state dict if name not in pt_state_dict: if allow_missing_keys: continue raise AttributeError("{} not found in PyTorch model".format(name)) array = pt_state_dict[name].numpy() if transpose: array = numpy.transpose(array) if len(symbolic_weight.shape) < len(array.shape): array = numpy.squeeze(array) elif len(symbolic_weight.shape) > len(array.shape): array = numpy.expand_dims(array, axis=0) try: assert list(symbolic_weight.shape) == list(array.shape) except AssertionError as e: e.args += (symbolic_weight.shape, array.shape) raise e tf_loaded_numel += array.size # logger.warning("Initialize TF weight {}".format(symbolic_weight.name)) weight_value_tuples.append((symbolic_weight, array)) all_pytorch_weights.discard(name) K.batch_set_value(weight_value_tuples) if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure restore ops are run logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel)) logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights)) return tf_model ##################### # TF 2.0 => PyTorch # ##################### def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise import transformers logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path)) # Instantiate and load the associated TF 2.0 model tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining tf_model_class = getattr(transformers, tf_model_class_name) tf_model = tf_model_class(pt_model.config) if tf_inputs is None: tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure model is built tf_model.load_weights(tf_checkpoint_path, by_name=True) return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys) def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): """ Load TF 2.0 model in a pytorch model """ weights = tf_model.weights return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys) def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False): """ Load TF2.0 symbolic weights in a PyTorch model """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise new_pt_params_dict = {} current_pt_params_dict = dict(pt_model.named_parameters()) # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't start_prefix_to_remove = "" if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): start_prefix_to_remove = pt_model.base_model_prefix + "." # Build a map from potential PyTorch weight names to TF 2.0 Variables tf_weights_map = {} for tf_weight in tf_weights: pt_name, transpose = convert_tf_weight_name_to_pt_weight_name( tf_weight.name, start_prefix_to_remove=start_prefix_to_remove ) tf_weights_map[pt_name] = (tf_weight.numpy(), transpose) all_tf_weights = set(list(tf_weights_map.keys())) loaded_pt_weights_data_ptr = {} missing_keys_pt = [] for pt_weight_name, pt_weight in current_pt_params_dict.items(): # Handle PyTorch shared weight ()not duplicated in TF 2.0 if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()] continue # Find associated numpy array in pytorch model state dict if pt_weight_name not in tf_weights_map: if allow_missing_keys: missing_keys_pt.append(pt_weight_name) continue raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name)) array, transpose = tf_weights_map[pt_weight_name] if transpose: array = numpy.transpose(array) if len(pt_weight.shape) < len(array.shape): array = numpy.squeeze(array) elif len(pt_weight.shape) > len(array.shape): array = numpy.expand_dims(array, axis=0) try: assert list(pt_weight.shape) == list(array.shape) except AssertionError as e: e.args += (pt_weight.shape, array.shape) raise e # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name)) new_pt_params_dict[pt_weight_name] = torch.from_numpy(array) loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array) all_tf_weights.discard(pt_weight_name) missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False) missing_keys += missing_keys_pt if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys) ) if len(unexpected_keys) > 0: logger.info( "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys) ) logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights)) return pt_model ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 RoBERTa model. """ import logging import tensorflow as tf from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list logger = logging.getLogger(__name__) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "roberta-base", "roberta-large", "roberta-large-mnli", "distilroberta-base", # See all RoBERTa models at https://huggingface.co/models?filter=roberta ] class TFRobertaEmbeddings(TFBertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.padding_idx = 1 def create_position_ids_from_input_ids(self, x): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param tf.Tensor x: :return tf.Tensor: """ mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) incremental_indicies = tf.math.cumsum(mask, axis=1) * mask return incremental_indicies + self.padding_idx def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. :param tf.Tensor inputs_embeds: :return tf.Tensor: """ seq_length = shape_list(inputs_embeds)[1] position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] return position_ids def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids) else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): return self.embeddings class TFRobertaPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = RobertaConfig base_model_prefix = "roberta" ROBERTA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.RobertaTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`__ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, ) class TFRobertaModel(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaModel tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaModel.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.roberta(inputs, **kwargs) return outputs class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = tf.keras.layers.Activation(gelu) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, features): x = self.dense(features) x = self.act(x) x = self.layer_norm(x) # project back to size of vocabulary with bias x = self.decoder(x, mode="linear") + self.bias return x @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") def get_output_embeddings(self): return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForMaskedLM tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForMaskedLM.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) class TFRobertaClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x, training=training) x = self.dense(x) x = self.dropout(x, training=training) x = self.out_proj(x) return x @add_start_docstrings( """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.classifier = TFRobertaClassificationHead(config, name="classifier") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForSequenceClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.classifier(sequence_output, training=kwargs.get("training", False)) outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForTokenClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-base is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForQuestionAnswering tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base') input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_t5.py ================================================ # coding=utf-8 # Copyright 2018 T5 Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 T5 model. """ import copy import itertools import logging import math import tensorflow as tf from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list logger = logging.getLogger(__name__) TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", # See all T5 models at https://huggingface.co/models?filter=t5 ] #################################################### # TF 2.0 Models are constructed using Keras imperative API by sub-classing # - tf.keras.layers.Layer for the layers and # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### class TFT5LayerNorm(tf.keras.layers.Layer): def __init__(self, epsilon=1e-6, **kwargs): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ super().__init__(**kwargs) self.variance_epsilon = epsilon def build(self, input_shape): """Build shared word embedding layer """ self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones") super().build(input_shape) def call(self, x): variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True) x = x * tf.math.rsqrt(variance + self.variance_epsilon) return self.weight * x class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi") self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = tf.keras.activations.relu def call(self, hidden_states, training=False): h = self.wi(hidden_states) h = self.act(h) h = self.dropout(h, training=training) h = self.wo(h) return h class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): norm_x = self.layer_norm(hidden_states) y = self.DenseReluDense(norm_x, training=training) layer_output = hidden_states + self.dropout(y, training=training) return layer_output class TFT5Attention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFT5Attention.NEW_ID) self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q") self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k") self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v") self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) if self.has_relative_attention_bias: self.relative_attention_bias = tf.keras.layers.Embedding( self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias", ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on. Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ ret = 0 n = -relative_position if bidirectional: num_buckets //= 2 ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets n = tf.math.abs(n) else: n = tf.math.maximum(n, 0) # now n is in the range [0, inf) max_exact = num_buckets // 2 is_small = tf.math.less(n, max_exact) val_if_large = max_exact + tf.dtypes.cast( tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32, ) val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) ret += tf.where(is_small, n, val_if_large) return ret def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = tf.range(qlen)[:, None] memory_position = tf.range(klen)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) rp_bucket = self._relative_position_bucket( relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) return values def call( self, input, mask=None, kv=None, position_bias=None, cache=None, past_key_value_state=None, head_mask=None, query_length=None, use_cache=False, training=False, ): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) # past_key_value_state[0] is (bs, n_heads, q_len - 1, dim_per_head) bs, qlen, dim = shape_list(input) if past_key_value_state is not None: assert self.is_decoder is True, "Encoder cannot cache past key value states" assert ( len(past_key_value_state) == 2 ), "past_key_value_state should have 2 past states: keys and values. Got {} past states".format( len(past_key_value_state) ) real_qlen = qlen + shape_list(past_key_value_state[0])[2] if query_length is None else query_length else: real_qlen = qlen if kv is None: klen = real_qlen else: klen = shape_list(kv)[1] def shape(x): """ projection """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3)) def unshape(x): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim)) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif past_key_value_state is None: k = v = kv k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if past_key_value_state is not None: if kv is None: k_, v_ = past_key_value_state k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = past_key_value_state # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if self.is_decoder and use_cache is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) scores = tf.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(real_qlen, klen) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] if mask is not None: position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) context = self.o(context) outputs = (context,) + present_key_value_state if self.output_attentions: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) return outputs class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.SelfAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call( self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, training=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention( norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.EncDecAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call( self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, query_length=None, use_cache=False, training=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, query_length=query_length, use_cache=use_cache, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class TFT5Block(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.is_decoder = config.is_decoder self.layer = [] self.layer.append( TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",) ) if self.is_decoder: self.layer.append( TFT5LayerCrossAttention( config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1", ) ) self.layer.append(TFT5LayerFF(config, name="layer_._{}".format(len(self.layer)))) def call( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, training=False, ): if past_key_value_state is not None: assert self.is_decoder, "Only decoder can use `past_key_value_states`" expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4 error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format( expected_num_past_key_value_states, "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "", len(past_key_value_state), ) assert len(past_key_value_state) == expected_num_past_key_value_states, error_message self_attn_past_key_value_state = past_key_value_state[:2] cross_attn_past_key_value_state = past_key_value_state[2:] else: self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=self_attn_past_key_value_state, use_cache=use_cache, training=training, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights if self.is_decoder and encoder_hidden_states is not None: # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: query_length = shape_list(present_key_value_state[0])[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, kv=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, head_mask=head_mask, past_key_value_state=cross_attn_past_key_value_state, query_length=query_length, use_cache=use_cache, training=training, ) hidden_states = cross_attention_outputs[0] # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states, training=training) outputs = (hidden_states,) # Add attentions if we output them outputs = outputs + (present_key_value_state,) + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class _NoLayerEmbedTokens(object): """ this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer' class to avoid problem with weight restoring. Also it makes sure that the layer is called from the correct scope to avoid problem with saving/storing the correct weights """ def __init__(self, layer, abs_scope_name=None): self._layer = layer self._abs_scope_name = abs_scope_name def call(self, inputs, mode="embedding"): if self._abs_scope_name is None: return self._layer.call(inputs, mode) # if an abs scope name is given to the embedding variable, call variable from absolute scope with tf.compat.v1.variable_scope(self._abs_scope_name, auxiliary_name_scope=False) as abs_scope_name: with tf.name_scope(abs_scope_name.original_name_scope): return self._layer.call(inputs, mode) def __call__(self, inputs, mode="embedding"): if self._abs_scope_name is None: return self._layer(inputs, mode) # if an abs scope name is given to the embedding variable, call variable from absolute scope with tf.compat.v1.variable_scope(self._abs_scope_name, auxiliary_name_scope=False) as abs_scope_name: with tf.name_scope(abs_scope_name.original_name_scope): return self._layer(inputs, mode) #################################################### # The full model without a specific pretrained or finetuning head is # provided as a tf.keras.layers.Layer usually called "TFT5MainLayer" #################################################### class TFT5MainLayer(tf.keras.layers.Layer): def __init__(self, config, embed_tokens=None, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder self.config = config self.num_hidden_layers = config.num_layers self.block = [ TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),) for i in range(config.num_layers) ] self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def get_input_embeddings(self): return self.embed_tokens def get_output_embeddings(self): return self.embed_tokens def set_embed_tokens(self, embed_tokens): self.embed_tokens = embed_tokens def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models def call( self, inputs, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, past_key_value_states=None, use_cache=False, training=False, ): if inputs is not None and inputs_embeds is not None: raise ValueError("You cannot specify both inputs and inputs_embeds at the same time") elif inputs is not None: input_shape = shape_list(inputs) inputs = tf.reshape(inputs, (-1, input_shape[-1])) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either inputs or inputs_embeds") if inputs_embeds is None: assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(inputs) batch_size, seq_length = input_shape if past_key_value_states is not None: assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( input_shape, (batch_size, 1) ) # required mask seq length can be calculated via length of past # key value states and seq_length = 1 for the last token mask_seq_length = shape_list(past_key_value_states[0][0])[2] + seq_length else: mask_seq_length = seq_length if attention_mask is None: attention_mask = tf.fill((batch_size, mask_seq_length), 1) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = shape_list(encoder_hidden_states)[1] encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1) # initialize past_key_value_states with `None` if past does not exist if past_key_value_states is None: past_key_value_states = [None] * len(self.block) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. attention_mask = tf.cast(attention_mask, dtype=tf.float32) num_dims_attention_mask = len(shape_list(attention_mask)) if num_dims_attention_mask == 3: extended_attention_mask = attention_mask[:, None, :, :] elif num_dims_attention_mask == 2: # Provided a padding mask of dimensions [batch_size, mask_seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] if self.is_decoder: seq_ids = tf.range(mask_seq_length) causal_mask = tf.less_equal( tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None], ) causal_mask = tf.cast(causal_mask, dtype=tf.float32) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] if past_key_value_states[0] is not None: extended_attention_mask = extended_attention_mask[:, :, -1:, :] else: extended_attention_mask = attention_mask[:, None, None, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # extended_attention_mask = tf.math.equal(extended_attention_mask, # tf.transpose(extended_attention_mask, perm=(-1, -2))) extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 if self.is_decoder and encoder_attention_mask is not None: # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, mask_seq_length, mask_seq_length] # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32) num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) if num_dims_encoder_attention_mask == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if num_dims_encoder_attention_mask == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) present_key_value_states = () all_hidden_states = () all_attentions = () position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds, training=training) for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i], past_key_value_state=past_key_value_state, use_cache=use_cache, training=training, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states, present_key_value_state = layer_outputs[:2] if i == 0: # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[3 if self.output_attentions else 2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3] # append next layer key value states present_key_value_states = present_key_value_states + (present_key_value_state,) if self.output_attentions: all_attentions = all_attentions + (layer_outputs[2],) hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states, training=training) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) outputs = outputs + (present_key_value_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) #################################################### # TFT5PreTrainedModel is a sub-class of tf.keras.Model # which take care of loading and saving pretrained weights # and various common utilities. # Here you just need to specify a few (self-explanatory) # pointers for your model. #################################################### class TFT5PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = T5Config base_model_prefix = "transformer" @property def dummy_inputs(self): inputs = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) dummy_inputs = { "inputs": inputs, "decoder_input_ids": inputs, "decoder_attention_mask": input_mask, } return dummy_inputs T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: https://arxiv.org/abs/1910.10683 .. _`tf.keras.Model`: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model Note on the model inputs: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with inputs only and nothing else: `model(inputs_ids) - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: `model([inputs, attention_mask])` or `model([inputs, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associaed to the input names given in the docstring: `model({'inputs': inputs, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ T5_INPUTS_DOCSTRING = r""" Args: inputs are usually used as a `dict` (see T5 description above for more information) containing all the following. inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the inputs on the right or the left. Indices can be obtained using :class:`transformers1.T5Tokenizer`. To know more on how to prepare :obj:`inputs` for pre-training take a look at `T5 Training <./t5.html#training>`_ . See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`). attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up decoding. If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`). inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`inputs` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `inputs` indices into associated vectors than the model's internal embedding lookup matrix. decoder_inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . head_mask: (:obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ @add_start_docstrings( "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", T5_START_DOCSTRING, ) class TFT5Model(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") # retrieve correct absolute scope for embed token wrapper with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name: pass embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name) encoder_config = copy.deepcopy(config) self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder") def get_input_embeddings(self): return self.shared def get_output_embeddings(self): return self.shared def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import T5Tokenizer, TFT5Model tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5Model.from_pretrained('t5-small') inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 outputs = model(inputs, decoder_input_ids=inputs) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if isinstance(inputs, dict): kwargs.update(inputs) else: kwargs["inputs"] = inputs # retrieve arguments inputs = kwargs.get("inputs", None) inputs_embeds = kwargs.get("inputs_embeds", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) decoder_attention_mask = kwargs.get("decoder_attention_mask", None) decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) use_cache = kwargs.get("use_cache", True) head_mask = kwargs.get("head_mask", None) # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( inputs, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] return decoder_outputs + encoder_outputs @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) class TFT5ForConditionalGeneration(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") # retrieve correct absolute scope for embed token wrapper with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name: pass embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name) encoder_config = copy.deepcopy(config) self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder") def get_input_embeddings(self): return self.shared def get_output_embeddings(self): return self.shared def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. Examples:: from transformers1 import T5Tokenizer, TFT5ForConditionalGeneration tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 outputs = model(inputs, decoder_input_ids=inputs) prediction_scores = outputs[0] tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1 model.generate(inputs) """ if isinstance(inputs, dict): kwargs.update(inputs) else: kwargs["inputs"] = inputs # retrieve arguments inputs = kwargs.get("inputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) decoder_attention_mask = kwargs.get("decoder_attention_mask", None) decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) use_cache = kwargs.get("use_cache", True) inputs_embeds = kwargs.get("inputs_embeds", None) decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) head_mask = kwargs.get("head_mask", None) # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( inputs, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) # insert decoder past at right place # to speed up decoding if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() lm_logits = embed_tokens(sequence_output, mode="linear") decoder_outputs = (lm_logits,) + decoder_outputs[1:] return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if len(past) < 2: encoder_outputs, decoder_past_key_value_states = past, None else: encoder_outputs, decoder_past_key_value_states = past[0], past[1] return { "inputs": None, # inputs don't have to be defined, but still need to be passed to make Keras.layer.__call__ happy "decoder_input_ids": inputs, # inputs are the decoder_input_ids "decoder_past_key_value_states": decoder_past_key_value_states, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "use_cache": use_cache, } def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if len(past) < 2: logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") return past decoder_past = past[1] past = (past[0],) reordered_decoder_past = () for layer_past_states in decoder_past: # get the correct batch idx from layer past batch dim # batch dim of `past` is at 2nd position reordered_layer_past_states = () for layer_past_state in layer_past_states: # need to set correct `past` for each of the four key / value states reordered_layer_past_states = reordered_layer_past_states + (tf.gather(layer_past_state, beam_idx),) assert shape_list(reordered_layer_past_states[0]) == shape_list(layer_past_states[0]) assert len(reordered_layer_past_states) == len(layer_past_states) reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return past + (reordered_decoder_past,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 Transformer XL model. """ import logging import tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "transfo-xl-wt103", # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl ] class TFPositionalEmbedding(tf.keras.layers.Layer): def __init__(self, demb, **kwargs): super().__init__(**kwargs) self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) def call(self, pos_seq, bsz=None): sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) if bsz is not None: return tf.tile(pos_emb[:, None, :], [1, bsz, 1]) else: return pos_emb[:, None, :] class TFPositionwiseFF(tf.keras.layers.Layer): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): super().__init__(**kwargs) self.d_model = d_model self.d_inner = d_inner self.dropout = dropout self.layer_1 = tf.keras.layers.Dense( d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" ) self.drop_1 = tf.keras.layers.Dropout(dropout) self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") self.drop_2 = tf.keras.layers.Dropout(dropout) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.pre_lnorm = pre_lnorm def call(self, inp, training=False): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = self.layer_norm(inp) core_out = self.layer_1(core_out) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) # residual connection output = core_out + inp else: # positionwise feed-forward core_out = self.layer_1(inp) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): def __init__( self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs ): super().__init__(**kwargs) self.output_attentions = output_attentions self.n_head = n_head self.d_model = d_model self.d_head = d_head self.dropout = dropout self.qkv_net = tf.keras.layers.Dense( 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" ) self.drop = tf.keras.layers.Dropout(dropout) self.dropatt = tf.keras.layers.Dropout(dropatt) self.o_net = tf.keras.layers.Dense( d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" ) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm if r_r_bias is not None and r_w_bias is not None: # Biases are shared self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias else: self.r_r_bias = None self.r_w_bias = None self.r_net = tf.keras.layers.Dense( self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" ) def build(self, input_shape): if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) super().build(input_shape) def _rel_shift(self, x): x_size = shape_list(x) x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]]) x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]]) x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) x = tf.reshape(x, x_size) return x def call(self, inputs, training=False): w, r, attn_mask, mems, head_mask = inputs qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1] if mems is not None: cat = tf.concat([mems, w], 0) if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(cat)) else: w_heads = self.qkv_net(cat) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) w_head_q = w_head_q[-qlen:] else: if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(w)) else: w_heads = self.qkv_net(w) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) klen = shape_list(w_head_k)[0] w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] attn_score = AC + BD attn_score = attn_score * self.scale # compute attention probability if attn_mask is not None: attn_mask_t = attn_mask[:, :, None, None] attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t # [qlen x klen x bsz x n_head] attn_prob = tf.nn.softmax(attn_score, axis=1) attn_prob = self.dropatt(attn_prob, training=training) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # compute attention vector attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out, training=training) if self.pre_lnorm: # residual connection outputs = [w + attn_out] else: # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: outputs.append(attn_prob) return outputs class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): def __init__( self, n_head, d_model, d_head, d_inner, dropout, tgt_len=None, ext_len=None, mem_len=None, dropatt=0.0, pre_lnorm=False, r_w_bias=None, r_r_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs ): super().__init__(**kwargs) self.dec_attn = TFRelPartialLearnableMultiHeadAttn( n_head, d_model, d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, output_attentions=output_attentions, layer_norm_epsilon=layer_norm_epsilon, name="dec_attn", ) self.pos_ff = TFPositionwiseFF( d_model, d_inner, dropout, pre_lnorm=pre_lnorm, init_std=init_std, layer_norm_epsilon=layer_norm_epsilon, name="pos_ff", ) def call(self, inputs, training=False): dec_inp, r, dec_attn_mask, mems, head_mask = inputs attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training) ff_output = self.pos_ff(attn_outputs[0], training=training) outputs = [ff_output] + attn_outputs[1:] return outputs class TFAdaptiveEmbedding(tf.keras.layers.Layer): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super().__init__(**kwargs) self.n_token = n_token self.d_embed = d_embed self.init_std = init_std self.cutoffs = cutoffs + [n_token] self.div_val = div_val self.d_proj = d_proj self.emb_scale = d_proj ** 0.5 self.cutoff_ends = [0] + self.cutoffs self.emb_layers = [] self.emb_projs = [] if div_val == 1: raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.emb_layers.append( tf.keras.layers.Embedding( r_idx - l_idx, d_emb_i, embeddings_initializer=get_initializer(init_std), name="emb_layers_._{}".format(i), ) ) def build(self, input_shape): for i in range(len(self.cutoffs)): d_emb_i = self.d_embed // (self.div_val ** i) self.emb_projs.append( self.add_weight( shape=(d_emb_i, self.d_proj), initializer=get_initializer(self.init_std), trainable=True, name="emb_projs_._{}".format(i), ) ) super().build(input_shape) def call(self, inp): if self.div_val == 1: raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: inp_flat = tf.reshape(inp, (-1,)) emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj]) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx emb_i = self.emb_layers[i](inp_i) emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i]) mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64)) embed_shape = shape_list(inp) + [self.d_proj] embed = tf.reshape(emb_flat, embed_shape) embed *= self.emb_scale return embed @keras_serializable class TFTransfoXLMainLayer(tf.keras.layers.Layer): config_class = TransfoXLConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model self.n_head = config.n_head self.d_head = config.d_head self.untie_r = config.untie_r self.word_emb = TFAdaptiveEmbedding( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, init_std=config.init_std, name="word_emb", ) self.drop = tf.keras.layers.Dropout(config.dropout) self.n_layer = config.n_layer self.tgt_len = config.tgt_len self.mem_len = config.mem_len self.ext_len = config.ext_len self.max_klen = config.tgt_len + config.ext_len + config.mem_len self.attn_type = config.attn_type self.layers = [] if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( TFRelPartialLearnableDecoderLayer( config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, r_w_bias=None if self.untie_r else self.r_w_bias, r_r_bias=None if self.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, init_std=config.init_std, name="layers_._{}".format(i), ) ) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.same_length = config.same_length self.clamp_len = config.clamp_len if self.attn_type == 0: # default attention self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb") else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint def build(self, input_shape): if not self.untie_r: self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) super().build(input_shape) def get_input_embeddings(self): return self.word_emb def _resize_token_embeddings(self, new_num_tokens): return self.word_emb def backward_compatible(self): self.sample_softmax = -1 def reset_length(self, tgt_len, ext_len, mem_len): self.tgt_len = tgt_len self.mem_len = mem_len self.ext_len = ext_len def _prune_heads(self, heads): raise NotImplementedError def init_mems(self, bsz): if self.mem_len > 0: mems = [] for i in range(self.n_layer): empty = tf.zeros([self.mem_len, bsz, self.d_model]) mems.append(empty) return mems else: return None def _update_mems(self, hids, mems, mlen, qlen): # does not deal with None if mems is None: return None # mems is not None assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens # will be used as the extended context. Hence, we only cache # the tokens from `mlen + qlen - self.ext_len - self.mem_len` # to `mlen + qlen - self.ext_len`. new_mems = [] end_idx = mlen + max(0, qlen - 0 - self.ext_len) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): cat = tf.concat([mems[i], hids[i]], axis=0) tf.stop_gradient(cat) new_mems.append(cat[beg_idx:end_idx]) return new_mems def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = tf.transpose(input_ids, perm=(1, 0)) qlen, bsz = shape_list(input_ids) elif inputs_embeds is not None: inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) qlen, bsz = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if mems is None: mems = self.init_mems(bsz) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layer if inputs_embeds is not None: word_emb = inputs_embeds else: word_emb = self.word_emb(input_ids) mlen = shape_list(mems[0])[0] if mems is not None else 0 klen = mlen + qlen attn_mask = tf.ones([qlen, qlen]) mask_u = tf.linalg.band_part(attn_mask, 0, -1) mask_dia = tf.linalg.band_part(attn_mask, 0, 0) attn_mask_pad = tf.zeros([qlen, mlen]) dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.linalg.band_part(attn_mask, -1, 0) dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1) # ::: PyTorch masking code for reference ::: # if self.same_length: # all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) # mask_len = klen - self.mem_len # if mask_len > 0: # mask_shift_len = qlen - mask_len # else: # mask_shift_len = qlen # dec_attn_mask = (torch.triu(all_ones, 1+mlen) # + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 # else: # dec_attn_mask = torch.triu( # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] hids = [] attentions = [] if self.attn_type == 0: # default pos_seq = tf.range(klen - 1, -1, -1.0) if self.clamp_len > 0: pos_seq = tf.minimum(pos_seq, self.clamp_len) pos_emb = self.pos_emb(pos_seq) core_out = self.drop(word_emb, training=training) pos_emb = self.drop(pos_emb, training=training) for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out, training=training) new_mems = self._update_mems(hids, mems, mlen, qlen) # We transpose back here to shape [bsz, len, hidden_dim] outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems] if self.output_hidden_states: # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] hids.append(core_out) hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids) outputs.append(hids) if self.output_attentions: # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) outputs.append(attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions) class TFTransfoXLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = TransfoXLConfig base_model_prefix = "transformer" TRANSFO_XL_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.TransfoXLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.TransfoXLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLModel(TFTransfoXLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFTransfoXLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import TransfoXLTokenizer, TFTransfoXLModel tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] """ outputs = self.transformer(inputs, **kwargs) return outputs class TFTransfoXLLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.sample_softmax = config.sample_softmax assert ( self.sample_softmax <= 0 ), "Sampling from the softmax is not implemented yet. Please look at issue: #3310: https://github.com/huggingface/transformers/issues/3310" self.crit = TFAdaptiveSoftmaxMask( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit" ) def get_output_embeddings(self): """ Double-check if you are using adaptive softmax. """ if len(self.crit.out_layers) > 0: return self.crit.out_layers[-1] return None def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) def init_mems(self, bsz): return self.transformer.init_mems(bsz) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import TransfoXLTokenizer, TFTransfoXLLMHeadModel tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds labels = inputs[4] if len(inputs) > 4 else labels assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) labels = inputs.get("labels", labels) assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs if input_ids is not None: bsz, tgt_len = shape_list(input_ids)[:2] else: bsz, tgt_len = shape_list(inputs_embeds)[:2] transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] outputs = transformer_outputs[1:] softmax_output = self.crit([pred_hid, labels], training=training) outputs = [softmax_output] + outputs return outputs # logits, new_mems, (all hidden states), (all attentions) def prepare_inputs_for_generation(self, inputs, past, **model_kwargs): inputs = {"inputs": inputs} # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_transfo_xl_utilities.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ A TF 2.0 Adaptive Softmax for Transformer XL model. """ import tensorflow as tf from .modeling_tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super().__init__(**kwargs) self.vocab_size = vocab_size self.d_embed = d_embed self.d_proj = d_proj self.cutoffs = cutoffs + [vocab_size] self.cutoff_ends = [0] + self.cutoffs self.div_val = div_val self.shortlist_size = self.cutoffs[0] self.n_clusters = len(self.cutoffs) - 1 self.head_size = self.shortlist_size + self.n_clusters self.keep_order = keep_order self.out_layers = [] self.out_projs = [] def build(self, input_shape): if self.n_clusters > 0: self.cluster_weight = self.add_weight( shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" ) self.cluster_bias = self.add_weight( shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" ) if self.div_val == 1: for i in range(len(self.cutoffs)): if self.d_proj != self.d_embed: weight = self.add_weight( shape=(self.d_embed, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i), ) self.out_projs.append(weight) else: self.out_projs.append(None) weight = self.add_weight( shape=(self.vocab_size, self.d_embed,), initializer="zeros", trainable=True, name="out_layers_._{}_._weight".format(i), ) bias = self.add_weight( shape=(self.vocab_size,), initializer="zeros", trainable=True, name="out_layers_._{}_._bias".format(i), ) self.out_layers.append((weight, bias)) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = self.d_embed // (self.div_val ** i) weight = self.add_weight( shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) ) self.out_projs.append(weight) weight = self.add_weight( shape=(r_idx - l_idx, d_emb_i,), initializer="zeros", trainable=True, name="out_layers_._{}_._weight".format(i), ) bias = self.add_weight( shape=(r_idx - l_idx,), initializer="zeros", trainable=True, name="out_layers_._{}_._bias".format(i), ) self.out_layers.append((weight, bias)) super().build(input_shape) @staticmethod def _logit(x, W, b, proj=None): y = x if proj is not None: y = tf.einsum("ibd,ed->ibe", y, proj) return tf.einsum("ibd,nd->ibn", y, W) + b @staticmethod def _gather_logprob(logprob, target): lp_size = shape_list(logprob) r = tf.range(lp_size[0]) idx = tf.stack([r, target], 1) return tf.gather_nd(logprob, idx) def call(self, inputs, return_mean=True, training=False): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) out = tf.nn.log_softmax(output, axis=-1) else: hidden_sizes = shape_list(hidden) out = [] loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] if target is not None: mask = (target >= l_idx) & (target < r_idx) mask_idx = tf.where(mask) cur_target = tf.boolean_mask(target, mask) - l_idx if self.div_val == 1: cur_W = self.out_layers[0][0][l_idx:r_idx] cur_b = self.out_layers[0][1][l_idx:r_idx] else: cur_W = self.out_layers[i][0] cur_b = self.out_layers[i][1] if i == 0: cur_W = tf.concat([cur_W, self.cluster_weight], 0) cur_b = tf.concat([cur_b, self.cluster_bias], 0) head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) head_logprob = tf.nn.log_softmax(head_logit) out.append(head_logprob[..., : self.cutoffs[0]]) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) else: tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) tail_logprob = tf.nn.log_softmax(tail_logit) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob out.append(logprob_i) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] if target is not None: loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) out = tf.concat(out, axis=-1) if target is not None: if return_mean: loss = tf.reduce_mean(loss) # Add the training-time loss value to the layer using `self.add_loss()`. self.add_loss(loss) # Log the loss as a metric (we could log arbitrary metrics, # including different metrics for training and inference. self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") return out ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """TF general model utils.""" import functools import logging import os import h5py import numpy as np import tensorflow as tf from tensorflow.python.keras.saving import hdf5_format from .configuration_utils import PretrainedConfig from .file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) class TFModelUtilsMixin: """ A few utilities for `tf.keras.Model`s, to be used as a mixin. """ def num_parameters(self, only_trainable: bool = False) -> int: """ Get number of (optionally, trainable) parameters in the model. """ if only_trainable: return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables)) else: return self.count_params() def keras_serializable(cls): """ Decorate a Keras Layer class to support Keras serialization. This is done by: 1. adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at serialization time 2. wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and convert it to a config object for the actual layer initializer 3. registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model` :param cls: a tf.keras.layers.Layers subclass that accepts a `config` argument to its initializer (typically a `TF*MainLayer` class in this project) :return: the same class object, with modifications for Keras deserialization. """ initializer = cls.__init__ config_class = getattr(cls, "config_class", None) if config_class is None: raise AttributeError("Must set `config_class` to use @keras_serializable") @functools.wraps(initializer) def wrapped_init(self, *args, **kwargs): transformers_config = kwargs.pop("transformers_config", None) config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.get("config", None) if config is not None and transformers_config is not None: raise ValueError("Must pass either `config` or `transformers_config`, not both") elif config is not None: # normal layer construction, call with unchanged args (config is already in there) initializer(self, *args, **kwargs) elif transformers_config is not None: # Keras deserialization, convert dict to config config = config_class.from_dict(transformers_config) initializer(self, config, *args, **kwargs) else: raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)") self._transformers_config = config cls.__init__ = wrapped_init if not hasattr(cls, "get_config"): raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses") if hasattr(cls.get_config, "_is_default"): def get_config(self): cfg = super(cls, self).get_config() cfg["transformers_config"] = self._transformers_config.to_dict() return cfg cls.get_config = get_config cls._keras_serializable = True if hasattr(tf.keras.utils, "register_keras_serializable"): cls = tf.keras.utils.register_keras_serializable()(cls) return cls class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): r""" Base class for all TF models. :class:`~transformers1.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. Class attributes (overridden by derived classes): - ``config_class``: a class derived from :class:`~transformers1.PretrainedConfig` to use as configuration class for this model architecture. - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - ``model``: an instance of the relevant subclass of :class:`~transformers1.PreTrainedModel`, - ``config``: an instance of the relevant subclass of :class:`~transformers1.PretrainedConfig`, - ``path``: a path (string) to the TensorFlow checkpoint. - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. """ config_class = None base_model_prefix = "" @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ ) ) # Save config in model self.config = config def get_input_embeddings(self): """ Returns the model's input embeddings. Returns: :obj:`tf.keras.layers.Layer`: A torch module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: return base_model.get_input_embeddings() else: raise NotImplementedError def get_output_embeddings(self): """ Returns the model's output embeddings. Returns: :obj:`tf.keras.layers.Layer`: A torch module mapping hidden states to vocabulary. """ return None # Overwrite for models with output embeddings def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): """ Build a resized Embedding Variable from a provided token Embedding Module. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end Args: new_num_tokens: (`optional`) int New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end If not provided or None: return the provided token Embedding Module. Return: ``tf.Variable`` Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None """ # if new_num_tokens is None: # return old_embeddings # old_num_tokens, old_embedding_dim = old_embeddings.weight.size() # if old_num_tokens == new_num_tokens: # return old_embeddings # # Build new embeddings # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) # new_embeddings.to(old_embeddings.weight.device) # # initialize all new embeddings (in particular added tokens) # self._init_weights(new_embeddings) # # Copy token embeddings from the previous weights # num_tokens_to_copy = min(old_num_tokens, new_num_tokens) # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] # return new_embeddings def resize_token_embeddings(self, new_num_tokens=None): """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Arguments: new_num_tokens: (`optional`) int: New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. Return: ``tf.Variable`` Pointer to the input tokens Embeddings Module of the model """ raise NotImplementedError def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. Arguments: heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). """ raise NotImplementedError def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the :func:`~transformers1.PreTrainedModel.from_pretrained` class method. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # Save configuration file self.config.save_pretrained(save_directory) # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME) self.save_weights(output_model_file) logger.info("Model weights saved in {}".format(output_model_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) one of: - an instance of a class derived from :class:`~transformers1.PretrainedConfig`, or - a string valid as input to :func:`~transformers1.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. from_pt: (`optional`) boolean, default False: Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: # For example purposes. Not runnable. model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) """ config = kwargs.pop("config", None) cache_dir = kwargs.pop("cache_dir", None) from_pt = kwargs.pop("from_pt", False) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) use_cdn = kwargs.pop("use_cdn", True) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, **kwargs, ) else: model_kwargs = kwargs # Load model if pretrained_model_name_or_path is not None: if os.path.isdir(pretrained_model_name_or_path): if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): # Load from a TF 2.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: raise EnvironmentError( "Error no file named {} found in directory {} or `from_pt` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path ) ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME), use_cdn=use_cdn, ) try: # Load from URL or cache if already cached resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, ) if resolved_archive_file is None: raise EnvironmentError except EnvironmentError: msg = ( f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {TF2_WEIGHTS_NAME}, {WEIGHTS_NAME}.\n\n" ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None # Instantiate model. model = cls(config, *model_args, **model_kwargs) if from_pt: # Load from a PyTorch checkpoint return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True) model(model.dummy_inputs, training=False) # build the network with dummy inputs assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) # 'by_name' allow us to do transfer learning by skipping/adding layers # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 try: model.load_weights(resolved_archive_file, by_name=True) except OSError: raise OSError( "Unable to load weights from h5 file. " "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " ) model(model.dummy_inputs, training=False) # Make sure restore ops are run # Check if the models are the same to output loading informations with h5py.File(resolved_archive_file, "r") as f: if "layer_names" not in f.attrs and "model_weights" in f: f = f["model_weights"] hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) model_layer_names = set(layer.name for layer in model.layers) missing_keys = list(model_layer_names - hdf5_layer_names) unexpected_keys = list(hdf5_layer_names - model_layer_names) error_msgs = [] if len(missing_keys) > 0: logger.info( "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) ) if len(unexpected_keys) > 0: logger.info( "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) ) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) if output_loading_info: loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} return model, loading_info return model def prepare_inputs_for_generation(self, inputs, **kwargs): return {"inputs": inputs} def _use_cache(self, outputs, use_cache): """During generation, decide whether to pass the `past` variable to the next forward pass.""" if len(outputs) <= 1 or use_cache is False: return False if hasattr(self.config, "mem_len") and self.config.mem_len == 0: return False return True def generate( self, input_ids=None, max_length=None, min_length=None, do_sample=None, early_stopping=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, length_penalty=None, no_repeat_ngram_size=None, num_return_sequences=None, attention_mask=None, decoder_start_token_id=None, use_cache=None, ): r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling and beam-search. Adapted in part from `Facebook's XLM beam search code`_. .. _`Facebook's XLM beam search code`: https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 Parameters: input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` The sequence used as a prompt for the generation. If `None` the method initializes it as an empty `tf.Tensor` of shape `(1,)`. max_length: (`optional`) int The max length of the sequence to be generated. Between 1 and infinity. Default to 20. min_length: (`optional`) int The min length of the sequence to be generated. Between 0 and infinity. Default to 0. do_sample: (`optional`) bool If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. early_stopping: (`optional`) bool if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. num_beams: (`optional`) int Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. temperature: (`optional`) float The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. top_k: (`optional`) int The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. top_p: (`optional`) float The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. repetition_penalty: (`optional`) float The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. bos_token_id: (`optional`) int Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. pad_token_id: (`optional`) int Pad token. Defaults to pad_token_id as defined in the models config. eos_token_id: (`optional`) int EOS token. Defaults to eos_token_id as defined in the models config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. bad_words_ids: (`optional`) list of lists of int `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. Defaults to `None`. `What are attention masks? <../glossary.html#attention-mask>`__ decoder_start_token_id=None: (`optional`) int If an encoder-decoder model starts decoding with a different token than BOS. Defaults to `None` and is changed to `BOS` later. use_cache: (`optional`) bool If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. Return: output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping use_cache = use_cache if use_cache is not None else self.config.use_cache num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) if input_ids is not None: batch_size = shape_list(input_ids)[0] # overriden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(use_cache, bool), "`use_cache` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." assert temperature > 0, "`temperature` should be strictely positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictely positive." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictely positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = tf.fill((batch_size, 1), bos_token_id) else: assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) elif attention_mask is None: attention_mask = tf.ones_like(input_ids) if pad_token_id is None and eos_token_id is not None: logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size cur_len = shape_list(input_ids)[1] vocab_size = self.config.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = shape_list(input_ids)[-1] input_ids = tf.broadcast_to( tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) attention_mask = tf.broadcast_to( tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) input_ids = tf.reshape( input_ids, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = tf.reshape( attention_mask, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = tf.reshape( tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), shape=(-1,), ) # expand encoder_outputs encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0), *encoder_outputs[1:]) else: encoder_outputs = None cur_len = shape_list(input_ids)[-1] if num_beams > 1: output = self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, batch_size=effective_batch_size, num_return_sequences=num_return_sequences, length_penalty=length_penalty, num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, ) else: output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, batch_size=effective_batch_size, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, ) return output def _generate_no_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, vocab_size, encoder_outputs, attention_mask, use_cache, ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ # length of generated sentences / unfinished sentences unfinished_sents = tf.ones_like(input_ids[:, 0]) sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = _create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: # create eos_token_id boolean mask is_token_logit_eos_token = tf.convert_to_tensor( [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool ) eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, eos_token_indices_mask, -float("inf") ) if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # Top-p/top-k filtering next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) # Sample next_token = tf.squeeze( tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 ) else: # Greedy decoding next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( unfinished_sents, tf.cast(eos_in_sents, tf.int32) ) sent_lengths = ( sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + cur_len * is_sents_unfinished_and_token_to_add_is_eos ) # unfinished_sents is set to zero if eos in sentence unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos # stop when there is a in each sentence, or if we exceed the maximul length if tf.math.reduce_max(unfinished_sents) == 0: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded min_sent_length = tf.math.reduce_min(sent_lengths) max_sent_length = tf.math.reduce_max(sent_lengths) if min_sent_length != max_sent_length: assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id # create length masks for tf.where operation broad_casted_sent_lengths = tf.broadcast_to( tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] ) broad_casted_range = tf.transpose( tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) ) decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) else: decoded = input_ids return decoded def _generate_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, early_stopping, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, decoder_start_token_id, eos_token_id, batch_size, num_return_sequences, length_penalty, num_beams, vocab_size, encoder_outputs, attention_mask, use_cache, ): """ Generate sequences for each example with beam search. """ # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) for _ in range(batch_size) ] # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times if do_sample is False: beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) else: beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) # cache compute states past = encoder_outputs # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache ) outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = _create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # calculate log softmax score scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: # create eos_token_id boolean mask num_batch_hypotheses = batch_size * num_beams is_token_logit_eos_token = tf.convert_to_tensor( [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool ) eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 num_batch_hypotheses = batch_size * num_beams banned_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len ) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) scores = set_tensor_by_indices_to_value( scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) scores = set_tensor_by_indices_to_value( scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) assert shape_list(scores) == [batch_size * num_beams, vocab_size] if do_sample: _scores = scores + tf.broadcast_to( beam_scores[:, None], (batch_size * num_beams, vocab_size) ) # (batch_size * num_beams, vocab_size) # Top-p/top-k filtering _scores = tf_top_k_top_p_filtering( _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) next_tokens = tf.random.categorical( _scores, dtype=tf.int32, num_samples=2 * num_beams ) # (batch_size, 2 * num_beams) # Compute next scores next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) # sort the sampled vector to make sure that the first num_beams samples are the best next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) else: # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) next_scores = scores + tf.broadcast_to( beam_scores[:, None], (batch_size * num_beams, vocab_size) ) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) next_scores = tf.reshape( next_scores, (batch_size, num_beams * vocab_size) ) # (batch_size, num_beams * vocab_size) next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] # next batch beam content next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): # if we are done with this sentence if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams ), "Batch can only be done if at least {} beams have been generated".format(num_beams) assert ( eos_token_id is not None and pad_token_id is not None ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content next_sent_beam = [] # next tokens for this sentence for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence or last iteration if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: continue generated_hyps[batch_idx].add( tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() ) else: # add next predicted token if it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: break # Check if were done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len=cur_len ) # update next beam content assert len(next_sent_beam) == num_beams, "Beam should always be full" next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_idx + 1) # stop when we are done with each sentence if all(done): break # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32) beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32) beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32) # re-order batch and update current length input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx]) input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1) cur_len = cur_len + 1 # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # finalize all open beam hypotheses and end to generated hypotheses for batch_idx in range(batch_size): # Add all open beam hypothesis to generated_hyps if done[batch_idx]: continue # test that beam scores match previously calculated scores if not eos and batch_idx not done if eos_token_id is not None and all( (token_id % vocab_size).numpy().item() is not eos_token_id for token_id in next_tokens[batch_idx] ): assert tf.reduce_all( next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] ) # need to add best num_beams hypotheses to generated hyps for beam_id in range(num_beams): effective_beam_id = batch_idx * num_beams + beam_id final_score = beam_scores[effective_beam_id].numpy().item() final_tokens = input_ids[effective_beam_id] generated_hyps[batch_idx].add(final_tokens, final_score) # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch output_batch_size = batch_size if do_sample else batch_size * num_return_sequences output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences # select the best hypotheses sent_lengths_list = [] best = [] # retrieve best hypotheses for i, hypotheses in enumerate(generated_hyps): sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) for j in range(output_num_return_sequences_per_batch): best_hyp = sorted_hyps.pop()[1] sent_lengths_list.append(len(best_hyp)) best.append(best_hyp) assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format( output_batch_size, len(best) ) sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32) # shorter batches are filled with pad_token if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy(): assert pad_token_id is not None, "`Pad_token_id` has to be defined" sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length) decoded_list = [] # fill with hypothesis and eos_token_id if necessary for i, hypo in enumerate(best): assert sent_lengths[i] == shape_list(hypo)[0] # if sent_length is max_len do not pad if sent_lengths[i] == sent_max_len: decoded_slice = hypo else: # else pad to sent_max_len num_pad_tokens = sent_max_len - sent_lengths[i] padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32) decoded_slice = tf.concat([hypo, padding], axis=-1) # finish sentence with EOS token if sent_lengths[i] < max_length: decoded_slice = tf.where( tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i], eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), decoded_slice, ) # add to list decoded_list.append(decoded_slice) decoded = tf.stack(decoded_list) else: # none of the hypotheses have an eos_token assert (len(hypo) == max_length for hypo in best) decoded = tf.stack(best) return decoded @staticmethod def _reorder_cache(past, beam_idx): return tuple(tf.gather(layer_past, beam_idx, axis=1) for layer_past in past) def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): # create logit penalties for already seen input_ids token_penalties = np.ones(shape_list(logits)) prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] for i, prev_input_id in enumerate(prev_input_ids): logit_penalized = logits[i].numpy()[prev_input_id] logit_penalties = np.zeros(logit_penalized.shape) # if previous logit score is < 0 then multiply repetition penalty else divide logit_penalties[logit_penalized < 0] = repetition_penalty logit_penalties[logit_penalized > 0] = 1 / repetition_penalty np.put(token_penalties[i], prev_input_id, logit_penalties) return tf.convert_to_tensor(token_penalties, dtype=tf.float32) def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].numpy().tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): banned_tokens = [] def _tokens_match(prev_tokens, tokens): if len(tokens) == 0: # if bad word tokens is just one token always ban it return True if len(tokens) > len(prev_input_ids): # if bad word tokens are longer then prev input_ids they can't be equal return False if prev_tokens[-len(tokens) :] == tokens: # if tokens match return True else: return False for prev_input_ids_slice in prev_input_ids: banned_tokens_slice = [] for banned_token_seq in bad_words_ids: assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( bad_words_ids ) if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: # if tokens do not match continue continue banned_tokens_slice.append(banned_token_seq[-1]) banned_tokens.append(banned_tokens_slice) return banned_tokens def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ logits_shape = shape_list(logits) if top_k > 0: top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) if top_p < 1.0: sorted_indices = tf.argsort(logits, direction="DESCENDING") sorted_logits = tf.gather( logits, sorted_indices, axis=-1, batch_dims=1 ) # expects logits to be of dim (batch_size, vocab_size) cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove = tf.concat( [ tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), sorted_indices_to_remove[:, min_tokens_to_keep:], ], -1, ) # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) sorted_indices_to_remove = tf.concat( [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, ) # scatter sorted tensors to original indexing indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) return logits def scatter_values_on_batch_indices(values, batch_indices): shape = shape_list(batch_indices) # broadcast batch dim to shape broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) # transform batch_indices to pair_indices pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) # scatter values to pair indices return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) def set_tensor_by_indices_to_value(tensor, indices, value): # create value_tensor since tensor value assignment is not possible in TF value_tensor = tf.zeros_like(tensor) + value return tf.where(indices, value_tensor, tensor) class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. """ self.max_length = max_length - 1 # ignoring bos_token self.length_penalty = length_penalty self.early_stopping = early_stopping self.num_beams = num_beams self.beams = [] self.worst_score = 1e9 def __len__(self): """ Number of hypotheses in the list. """ return len(self.beams) def add(self, hyp, sum_logprobs): """ Add a new hypothesis to the list. """ score = sum_logprobs / len(hyp) ** self.length_penalty if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_scores[0][1]] self.worst_score = sorted_scores[1][0] else: self.worst_score = min(score, self.worst_score) def is_done(self, best_sum_logprobs, cur_len=None): """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. """ if len(self) < self.num_beams: return False elif self.early_stopping: return True else: if cur_len is None: cur_len = self.max_length cur_score = best_sum_logprobs / cur_len ** self.length_penalty ret = self.worst_score >= cur_score return ret class TFConv1D(tf.keras.layers.Layer): def __init__(self, nf, nx, initializer_range=0.02, **kwargs): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ super().__init__(**kwargs) self.nf = nf self.nx = nx self.initializer_range = initializer_range def build(self, input_shape): self.weight = self.add_weight( "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) def call(self, x): bz, sl = shape_list(x)[:2] x = tf.reshape(x, [-1, self.nx]) x = tf.matmul(x, self.weight) + self.bias x = tf.reshape(x, [bz, sl, self.nf]) return x class TFSharedEmbeddings(tf.keras.layers.Layer): """Construct shared token embeddings. """ def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """Build shared token embedding layer Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) def call(self, inputs, mode="embedding"): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, input_ids): """Applies embedding based on inputs tensor.""" return tf.gather(self.weight, input_ids) def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [..., hidden_size] Returns: float32 tensor with shape [..., vocab_size]. """ first_dims = shape_list(inputs)[:-1] x = tf.reshape(inputs, [-1, self.hidden_size]) logits = tf.matmul(x, self.weight, transpose_b=True) return tf.reshape(logits, first_dims + [self.vocab_size]) class TFSequenceSummary(tf.keras.layers.Layer): r""" Compute a single vector summary of a sequence hidden states according to various possibilities: Args of the config class: summary_type: - 'last' => [default] take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj: Add a projection after the vector extraction summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ def __init__(self, config, initializer_range=0.02, **kwargs): super().__init__(**kwargs) self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj if self.has_summary: if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" if self.has_activation: self.activation = tf.keras.activations.tanh self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 if self.has_last_dropout: self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) def call(self, inputs, training=False): """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. cls_index: [optional] position of the classification token if summary_type == 'cls_index', shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ if not isinstance(inputs, (dict, tuple, list)): hidden_states = inputs cls_index = None elif isinstance(inputs, (tuple, list)): hidden_states = inputs[0] cls_index = inputs[1] if len(inputs) > 1 else None assert len(inputs) <= 2, "Too many inputs." else: hidden_states = inputs.get("hidden_states") cls_index = inputs.get("cls_index", None) if self.summary_type == "last": output = hidden_states[:, -1] elif self.summary_type == "first": output = hidden_states[:, 0] elif self.summary_type == "mean": output = tf.reduce_mean(hidden_states, axis=1) elif self.summary_type == "cls_index": hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims] if cls_index is None: cls_index = tf.fill( hidden_shape[:-2], hidden_shape[-2] - 1 ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length cls_shape = shape_list(cls_index) if len(cls_shape) <= len(hidden_shape) - 2: cls_index = cls_index[..., tf.newaxis] # else: # cls_index = cls_index[..., tf.newaxis] # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) output = tf.squeeze( output, axis=len(hidden_shape) - 2 ) # shape of output: (batch, num choices, hidden_size) elif self.summary_type == "attn": raise NotImplementedError if self.has_first_dropout: output = self.first_dropout(output, training=training) if self.has_summary: output = self.summary(output) if self.has_activation: output = self.activation(output) if self.has_last_dropout: output = self.last_dropout(output, training=training) return output def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() dynamic = tf.shape(x) return [dynamic[i] if s is None else s for i, s in enumerate(static)] def get_initializer(initializer_range=0.02): """Creates a `tf.initializers.truncated_normal` with the given range. Args: initializer_range: float, initializer range for stddev. Returns: TruncatedNormal initializer with stddev = `initializer_range`. """ return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLM model. """ import itertools import logging import math import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-mlm-en-2048", "xlm-mlm-ende-1024", "xlm-mlm-enfr-1024", "xlm-mlm-enro-1024", "xlm-mlm-tlm-xnli15-1024", "xlm-mlm-xnli15-1024", "xlm-clm-enfr-1024", "xlm-clm-ende-1024", "xlm-mlm-17-1280", "xlm-mlm-100-1280", # See all XLM models at https://huggingface.co/models?filter=xlm ] def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): """ Generate hidden states mask, and optionally an attention mask. """ bs = shape_list(lengths)[0] if padding_mask is not None: mask = padding_mask else: # assert lengths.max().item() <= slen alen = tf.range(slen) mask = tf.math.less(alen, lengths[:, tf.newaxis]) # attention mask is the same as mask, or triangular inferior attention (causal) if causal: attn_mask = tf.less_equal( tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis] ) else: attn_mask = mask # sanity check # assert shape_list(mask) == [bs, slen] tf.debugging.assert_equal(shape_list(mask), [bs, slen]) assert causal is False or shape_list(attn_mask) == [bs, slen, slen] mask = tf.cast(mask, dtype=dtype) attn_mask = tf.cast(attn_mask, dtype=dtype) return mask, attn_mask class TFMultiHeadAttention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFMultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim self.n_heads = n_heads assert self.dim % self.n_heads == 0 self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ input, mask, kv, cache, head_mask = inputs # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) n_heads = self.n_heads dim_per_head = self.dim // n_heads mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) def shape(x): """ projection """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) def unshape(x): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores = scores - 1e30 * (1.0 - mask) weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: outputs = outputs + (weights,) return outputs class TFTransformerFFN(tf.keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super().__init__(**kwargs) self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, input, training=False): x = self.lin1(input) x = self.act(x) x = self.lin2(x) x = self.dropout(x, training=training) return x class TFXLMMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states # encoder / decoder, output layer self.is_encoder = config.is_encoder self.is_decoder = not config.is_encoder if self.is_decoder: raise NotImplementedError("Currently XLM can only be used as an encoder") # self.with_output = with_output self.causal = config.causal # dictionary / languages self.n_langs = config.n_langs self.use_lang_emb = config.use_lang_emb self.n_words = config.n_words self.eos_index = config.eos_index self.pad_index = config.pad_index # self.dico = dico # self.id2lang = config.id2lang # self.lang2id = config.lang2id # assert len(self.dico) == self.n_words # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.dropout = tf.keras.layers.Dropout(config.dropout) self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, self.dim, embeddings_initializer=get_initializer(config.embed_init_std), name="position_embeddings", ) if config.sinusoidal_embeddings: raise NotImplementedError # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: self.lang_embeddings = tf.keras.layers.Embedding( self.n_langs, self.dim, embeddings_initializer=get_initializer(config.embed_init_std), name="lang_embeddings", ) self.embeddings = TFSharedEmbeddings( self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" ) # padding_idx=self.pad_index) self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") # transformer layers self.attentions = [] self.layer_norm1 = [] self.ffns = [] self.layer_norm2 = [] # if self.is_decoder: # self.layer_norm15 = [] # self.encoder_attn = [] for i in range(self.n_layers): self.attentions.append( TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i)) ) self.layer_norm1.append( tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i)) ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) self.ffns.append( TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i)) ) self.layer_norm2.append( tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i)) ) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() config.pruned_heads = {} for layer, heads in pruned_heads: if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, training=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask langs = inputs[2] if len(inputs) > 2 else langs token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids lengths = inputs[5] if len(inputs) > 5 else lengths cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) langs = inputs.get("langs", langs) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) lengths = inputs.get("lengths", lengths) cache = inputs.get("cache", cache) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: bs, slen = shape_list(input_ids) elif inputs_embeds is not None: bs, slen = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if lengths is None: if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs # assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs) # assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # position_ids if position_ids is None: position_ids = tf.expand_dims(tf.range(slen), axis=0) else: # assert shape_list(position_ids) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: # assert shape_list(langs) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(langs), [bs, slen]) # langs = langs.transpose(0, 1) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids) if langs is not None and self.use_lang_emb and self.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = self.dropout(tensor, training=training) tensor = tensor * mask[..., tf.newaxis] # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) tensor = tensor * mask[..., tf.newaxis] # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) class TFXLMPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLMConfig base_model_prefix = "transformer" @property def dummy_inputs(self): # Sometimes XLM has language embeddings so don't forget to build them as well if needed inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) if self.config.use_lang_emb and self.config.n_langs > 1: langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} XLM_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLMConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`): dictionary with ``tf.Tensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", XLM_START_DOCSTRING, ) class TFXLMModel(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMModel tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMModel.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs class TFXLMPredLayer(tf.keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index if config.asm is False: self.input_embeddings = input_embeddings else: raise NotImplementedError # self.proj = nn.AdaptiveLogSoftmaxWithLoss( # in_features=dim, # n_classes=config.n_words, # cutoffs=config.asm_cutoffs, # div_value=config.asm_div_value, # head_bias=True, # default is False # ) def build(self, input_shape): # The output weights are the same as the input embeddings, but there is an output-only bias for each token. self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLM_START_DOCSTRING, ) class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") def get_output_embeddings(self): return self.pred_layer.input_embeddings def prepare_inputs_for_generation(self, inputs, **kwargs): mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id effective_batch_size = inputs.shape[0] mask_token = tf.ones((effective_batch_size, 1), dtype=tf.int32) * mask_token_id inputs = tf.concat([inputs, mask_token], axis=1) if lang_id is not None: langs = tf.ones_like(inputs) * lang_id else: langs = None return {"inputs": inputs, "langs": langs} @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMWithLMHeadModel tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] outputs = self.pred_layer(output) outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_START_DOCSTRING, ) class TFXLMForSequenceClassification(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLMMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMForSequenceClassification tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] logits = self.sequence_summary(output) outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMForQuestionAnsweringSimple tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLM-RoBERTa model. """ import logging from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) logger = logging.getLogger(__name__) TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta ] XLM_ROBERTA_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaModel(TFRobertaModel): """ This class overrides :class:`~transformers1.TFRobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): """ This class overrides :class:`~transformers1.TFRobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): """ This class overrides :class:`~transformers1.TFRobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): """ This class overrides :class:`~transformers1.TFRobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_tf_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLNet model. """ import logging import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, keras_serializable, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlnet-base-cased", "xlnet-large-cased", # See all XLNet models at https://huggingface.co/models?filter=xlnet ] def gelu(x): """ Implementation of the gelu activation function. XLNet is using OpenAI GPT's gelu Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.sigmoid(x) ACT2FN = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), } class TFXLNetRelativeAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.d_model, config.n_head) ) self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model self.scale = 1 / (config.d_head ** 0.5) self.initializer_range = config.initializer_range self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): initializer = get_initializer(self.initializer_range) self.q = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q" ) self.k = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k" ) self.v = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v" ) self.o = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o" ) self.r = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r" ) self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) self.r_s_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias" ) self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) self.seg_embed = self.add_weight( shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed" ) super().build(input_shape) def prune_heads(self, heads): raise NotImplementedError def rel_shift(self, x, klen=-1): """perform relative shift to form the relative attention score.""" x_size = shape_list(x) x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3])) x = x[1:, ...] x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3])) x = x[:, 0:klen, :, :] # x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) return x def rel_attn_core(self, inputs, training=False): """Core relative positional attention operations.""" q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs # content based attention score ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h) # position based attention score bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift(bd, klen=shape_list(ac)[1]) # segment based attention score if seg_mat is None: ef = 0 else: ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == tf.float16: attn_score = attn_score - 65500 * attn_mask else: attn_score = attn_score - 1e30 * attn_mask # attention probability attn_prob = tf.nn.softmax(attn_score, axis=1) attn_prob = self.dropout(attn_prob, training=training) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # attention output attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, attn_prob return attn_vec def post_attention(self, inputs, residual=True, training=False): """Post-attention processing.""" # post-attention projection (back to `d_model`) h, attn_vec = inputs attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out, training=training) if residual: attn_out = attn_out + h output = self.layer_norm(attn_out) return output def call(self, inputs, training=False): (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h # content-based key head k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # h-stream # content-stream query head q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h # post processing output_h = self.post_attention([h, attn_vec_h], training=training) # g-stream # query-stream query head q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g # post processing output_g = self.post_attention([g, attn_vec_g], training=training) if self.output_attentions: attn_prob = attn_prob_h, attn_prob_g else: # Multi-head attention with relative positional encoding if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h # content heads q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training ) if self.output_attentions: attn_vec, attn_prob = attn_vec # post processing output_h = self.post_attention([h, attn_vec], training=training) output_g = None outputs = (output_h, output_g) if self.output_attentions: outputs = outputs + (attn_prob,) return outputs class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.layer_1 = tf.keras.layers.Dense( config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" ) self.layer_2 = tf.keras.layers.Dense( config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2" ) self.dropout = tf.keras.layers.Dropout(config.dropout) if isinstance(config.ff_activation, str): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation def call(self, inp, training=False): output = inp output = self.layer_1(output) output = self.activation_function(output) output = self.dropout(output, training=training) output = self.layer_2(output) output = self.dropout(output, training=training) output = self.layer_norm(output + inp) return output class TFXLNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") self.ff = TFXLNetFeedForward(config, name="ff") self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, inputs, training=False): outputs = self.rel_attn(inputs, training=training) output_h, output_g = outputs[:2] if output_g is not None: output_g = self.ff(output_g, training=training) output_h = self.ff(output_h, training=training) outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there return outputs class TFXLNetLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @keras_serializable class TFXLNetMainLayer(tf.keras.layers.Layer): config_class = XLNetConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.mem_len = config.mem_len self.reuse_len = config.reuse_len self.d_model = config.d_model self.same_length = config.same_length self.attn_type = config.attn_type self.bi_data = config.bi_data self.clamp_len = config.clamp_len self.n_layer = config.n_layer self.use_bfloat16 = config.use_bfloat16 self.initializer_range = config.initializer_range self.word_embedding = TFSharedEmbeddings( config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding" ) self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) def get_input_embeddings(self): return self.word_embedding def build(self, input_shape): initializer = get_initializer(self.initializer_range) self.mask_emb = self.add_weight( shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb" ) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): raise NotImplementedError def create_mask(self, qlen, mlen, dtype=tf.float32): """ Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. Args: qlen: TODO Lysandre didn't fill mlen: TODO Lysandre didn't fill :: same_length=False: same_length=True: < qlen > < qlen > ^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1] qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1] [0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1] v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0] """ attn_mask = tf.ones([qlen, qlen], dtype=dtype) mask_u = tf.matrix_band_part(attn_mask, 0, -1) mask_dia = tf.matrix_band_part(attn_mask, 0, 0) attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype) ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.matrix_band_part(attn_mask, -1, 0) ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1) return ret def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: curr_out = curr_out[: self.reuse_len] if prev_mem is None: new_mem = curr_out[-self.mem_len :] else: new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :] return tf.stop_gradient(new_mem) @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1) pos_emb = pos_emb[:, None, :] if bsz is not None: pos_emb = tf.tile(pos_emb, [1, bsz, 1]) return pos_emb def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None): """create relative positional encoding.""" freq_seq = tf.range(0, self.d_model, 2.0) if dtype is not None and dtype != tf.float32: freq_seq = tf.cast(freq_seq, dtype=dtype) inv_freq = 1 / (10000 ** (freq_seq / self.d_model)) if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = tf.range(beg, end, -1.0) bwd_pos_seq = tf.range(-beg, -end, 1.0) if dtype is not None and dtype != tf.float32: fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype) if self.clamp_len > 0: fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len) bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len) if bsz is not None: # With bi_data, the batch size should be divisible by 2. assert bsz % 2 == 0 fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1) else: fwd_pos_seq = tf.range(beg, end, -1.0) if dtype is not None and dtype != tf.float32: fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) if self.clamp_len > 0: fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len) pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) return pos_emb def call( self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask mems = inputs[2] if len(inputs) > 2 else mems perm_mask = inputs[3] if len(inputs) > 3 else perm_mask target_mapping = inputs[4] if len(inputs) > 4 else target_mapping token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids input_mask = inputs[6] if len(inputs) > 6 else input_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds use_cache = inputs[9] if len(inputs) > 9 else use_cache assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) mems = inputs.get("mems", mems) perm_mask = inputs.get("perm_mask", perm_mask) target_mapping = inputs.get("target_mapping", target_mapping) token_type_ids = inputs.get("token_type_ids", token_type_ids) input_mask = inputs.get("input_mask", input_mask) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 10, "Too many inputs." else: input_ids = inputs # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = tf.transpose(input_ids, perm=(1, 0)) qlen, bsz = shape_list(input_ids)[:2] elif inputs_embeds is not None: inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) qlen, bsz = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32 # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] elif self.attn_type == "bi": attn_mask = None else: raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, ( "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." ) if input_mask is None and attention_mask is not None: input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float) if input_mask is not None and perm_mask is not None: data_mask = input_mask[None] + perm_mask elif input_mask is not None and perm_mask is None: data_mask = input_mask[None] elif input_mask is None and perm_mask is not None: data_mask = perm_mask else: data_mask = None if data_mask is not None: # all mems can be attended to if mlen > 0: mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float) data_mask = tf.concat([mems_mask, data_mask], axis=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] else: attn_mask += data_mask[:, :, :, None] if attn_mask is not None: attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float) if attn_mask is not None: non_tgt_mask = -tf.eye(qlen, dtype=dtype_float) if mlen > 0: non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1) non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float) else: non_tgt_mask = None # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1]) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q, training=training) else: output_g = None # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) cat_ids = tf.concat([mem_pad, token_type_ids], 0) else: cat_ids = token_type_ids # `1` indicates not in the same segment [qlen x klen x bsz] seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32) seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float) else: seg_mat = None # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float) pos_emb = self.dropout(pos_emb, training=training) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layer new_mems = () if mems is None: mems = [None] * len(self.layer) attentions = [] hidden_states = [] for i, layer_module in enumerate(self.layer): # cache new mems if self.mem_len is not None and self.mem_len > 0 and use_cache is True: new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) outputs = layer_module( [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]], training=training, ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) # Add last hidden state if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) output = self.dropout(output_g if output_g is not None else output_h, training=training) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) outputs = (tf.transpose(output, perm=(1, 0, 2)),) if self.mem_len is not None and self.mem_len > 0 and use_cache is True: outputs = outputs + (new_mems,) if self.output_hidden_states: if output_g is not None: hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs) else: hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states) outputs = outputs + (hidden_states,) if self.output_attentions: attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) outputs = outputs + (attentions,) return outputs # outputs, (new_mems), (hidden_states), (attentions) class TFXLNetPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLNetConfig base_model_prefix = "transformer" XLNET_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLNetConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLNET_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.XLNetTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. perm_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``: If ``perm_mask[k, i, j] = 0``, i attend to j in batch k; if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. If None, each token attends to all the others (full bidirectional attention). Only used during pretraining (to define factorization order) or for sequential decoding (generation). target_mapping (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding (generation). token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ input_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. Kept for compatibility with the original code base. You can only uses one of `input_mask` and `attention_mask` Mask values selected in ``[0, 1]``: ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. """ @add_start_docstrings( "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", XLNET_START_DOCSTRING, ) class TFXLNetModel(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetModel tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetModel.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss") def get_output_embeddings(self): return self.lm_loss.input_embeddings def prepare_inputs_for_generation(self, inputs, past, **kwargs): # Add dummy token at the end (no attention on this one) effective_batch_size = inputs.shape[0] dummy_token = tf.zeros((effective_batch_size, 1), dtype=tf.int32) inputs = tf.concat([inputs, dummy_token], axis=1) # Build permutation mask so that previous tokens don't see last token sequence_length = inputs.shape[1] perm_mask = tf.zeros((effective_batch_size, sequence_length, sequence_length - 1), dtype=tf.float32) perm_mask_seq_end = tf.ones((effective_batch_size, sequence_length, 1), dtype=tf.float32) perm_mask = tf.concat([perm_mask, perm_mask_seq_end], axis=-1) # We'll only predict the last token target_mapping = tf.zeros((effective_batch_size, 1, sequence_length - 1), dtype=tf.float32) target_mapping_seq_end = tf.ones((effective_batch_size, 1, 1), dtype=tf.float32) target_mapping = tf.concat([target_mapping, target_mapping_seq_end], axis=-1) inputs = { "inputs": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "use_cache": kwargs["use_cache"], } # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf import numpy as np from transformers1 import XLNetTokenizer, TFXLNetLMHeadModel tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. input_ids = tf.constant(tokenizer.encode("Hello, my dog is very ", add_special_tokens=True))[None, :] # We will predict the masked token perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1])) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32)) next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_state = transformer_outputs[0] logits = self.lm_loss(hidden_state) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLNET_START_DOCSTRING, ) class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) self.logits_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForSequenceClassification tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNET_START_DOCSTRING, ) class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): Classification scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForTokenClassification tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] logits = self.classifier(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (mems), (hidden_states), (attentions) # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # the hidden-states output to compute `span start logits` and `span end logits`). """, # XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) # class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): # r""" # Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: # **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)`` # Log probabilities for the top config.start_n_top start token possibilities (beam-search). # **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)`` # Indices for the top config.start_n_top start token possibilities (beam-search). # **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` # Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). # **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` # Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). # **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size,)`` # Log probabilities for the ``is_impossible`` label of the answers. # **mems**: # list of ``tf.Tensor`` (one for each layer): # that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model # if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. # See details in the docstring of the `mems` input above. # **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) # list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) # of shape ``(batch_size, sequence_length, hidden_size)``: # Hidden-states of the model at the output of each layer plus the initial embedding outputs. # **attentions**: (`optional`, returned when ``config.output_attentions=True``) # list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: # Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. # Examples:: # # For example purposes. Not runnable. # tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') # model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') # input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 # start_positions = tf.constant([1]) # end_positions = tf.constant([3]) # outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) # loss, start_scores, end_scores = outputs[:2] # """ # def __init__(self, config, *inputs, **kwargs): # super().__init__(config, *inputs, **kwargs) # self.start_n_top = config.start_n_top # self.end_n_top = config.end_n_top # self.transformer = TFXLNetMainLayer(config, name='transformer') # self.start_logits = TFPoolerStartLogits(config, name='start_logits') # self.end_logits = TFPoolerEndLogits(config, name='end_logits') # self.answer_class = TFPoolerAnswerClass(config, name='answer_class') # def call(self, inputs, training=False): # transformer_outputs = self.transformer(inputs, training=training) # hidden_states = transformer_outputs[0] # start_logits = self.start_logits(hidden_states, p_mask=p_mask) # outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it # if start_positions is not None and end_positions is not None: # # If we are on multi-GPU, let's remove the dimension added by batch splitting # for x in (start_positions, end_positions, cls_index, is_impossible): # if x is not None and x.dim() > 1: # x.squeeze_(-1) # # during training, compute the end logits based on the ground truth of the start position # end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) # loss_fct = CrossEntropyLoss() # start_loss = loss_fct(start_logits, start_positions) # end_loss = loss_fct(end_logits, end_positions) # total_loss = (start_loss + end_loss) / 2 # if cls_index is not None and is_impossible is not None: # # Predict answerability from the representation of CLS and START # cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) # loss_fct_cls = nn.BCEWithLogitsLoss() # cls_loss = loss_fct_cls(cls_logits, is_impossible) # # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # total_loss += cls_loss * 0.5 # outputs = (total_loss,) + outputs # else: # # during inference, compute the end logits based on beam search # bsz, slen, hsz = hidden_states.size() # start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) # start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) # start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) # start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) # start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) # hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) # p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None # end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) # end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) # end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) # end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) # end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) # start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states # cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample # outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs # # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # # or (if labels are provided) (total_loss,) # return outputs ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py """ import logging import torch import torch.nn as nn import torch.nn.functional as F from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "transfo-xl-wt103", # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl ] def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} if hasattr(model, "transformer"): # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax tf_to_pt_map.update( { "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias, } ) for i, (out_l, proj_l, tie_proj) in enumerate( zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs) ): layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i if config.tie_weight: tf_to_pt_map.update({layer_str + "b": out_l.bias}) else: raise NotImplementedError # I don't think this is implemented in the TF code tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias}) if not tie_proj: tf_to_pt_map.update({layer_str + "proj": proj_l}) # Now load the rest of the transformer model = model.transformer # Embeddings for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): layer_str = "transformer/adaptive_embed/cutoff_%d/" % i tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l}) # Transformer blocks for i, b in enumerate(model.layers): layer_str = "transformer/layer_%d/" % i tf_to_pt_map.update( { layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, } ) # Relative positioning biases if config.untie_r: r_r_list = [] r_w_list = [] for b in model.layers: r_r_list.append(b.dec_attn.r_r_bias) r_w_list.append(b.dec_attn.r_w_bias) else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list}) return tf_to_pt_map def load_tf_weights_in_transfo_xl(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ try: import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_to_pytorch_map(model, config) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) tf_weights[name] = array for name, pointer in tf_to_pt_map.items(): assert name in tf_weights array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if "kernel" in name or "proj" in name: array = np.transpose(array) if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] try: assert p_i.shape == arr_i.shape except AssertionError as e: e.args += (p_i.shape, arr_i.shape) raise logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) p_i.data = torch.from_numpy(arr_i) else: try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) tf_weights.pop(name + "/Adam", None) tf_weights.pop(name + "/Adam_1", None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model class PositionalEmbedding(nn.Module): def __init__(self, demb): super().__init__() self.demb = demb inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) self.register_buffer("inv_freq", inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: return pos_emb[:, None, :].expand(-1, bsz, -1) else: return pos_emb[:, None, :] class PositionwiseFF(nn.Module): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): super().__init__() self.d_model = d_model self.d_inner = d_inner self.dropout = dropout self.CoreNet = nn.Sequential( nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), nn.Dropout(dropout), nn.Linear(d_inner, d_model), nn.Dropout(dropout), ) self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.pre_lnorm = pre_lnorm def forward(self, inp): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = self.CoreNet(self.layer_norm(inp)) # residual connection output = core_out + inp else: # positionwise feed-forward core_out = self.CoreNet(inp) # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output class RelPartialLearnableMultiHeadAttn(nn.Module): def __init__( self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, ): super().__init__() self.output_attentions = output_attentions self.n_head = n_head self.d_model = d_model self.d_head = d_head self.dropout = dropout self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) self.drop = nn.Dropout(dropout) self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm if r_r_bias is None or r_w_bias is None: # Biases are not shared self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) else: self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) def _rel_shift(self, x): zero_pad_shape = (x.size(0), 1) + x.size()[2:] zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) x_padded = torch.cat([zero_pad, x], dim=1) x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] x_padded = x_padded.view(*x_padded_shape) x = x_padded[1:].view_as(x) return x def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) if mems is not None: cat = torch.cat([mems, w], 0) if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(cat)) else: w_heads = self.qkv_net(cat) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) w_head_q = w_head_q[-qlen:] else: if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(w)) else: w_heads = self.qkv_net(w) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) klen = w_head_k.size(0) w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] attn_score = AC + BD attn_score.mul_(self.scale) # compute attention probability if attn_mask is not None and torch.sum(attn_mask).item(): attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: if next(self.parameters()).dtype == torch.float16: attn_score = ( attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score) ) else: attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score) elif attn_mask.dim() == 3: if next(self.parameters()).dtype == torch.float16: attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score) else: attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] attn_prob = F.softmax(attn_score, dim=1) attn_prob = self.dropatt(attn_prob) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # compute attention vector attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out) if self.pre_lnorm: # residual connection outputs = [w + attn_out] else: # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: outputs.append(attn_prob) return outputs class RelPartialLearnableDecoderLayer(nn.Module): def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): super().__init__() self.dec_attn = RelPartialLearnableMultiHeadAttn( n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs ) self.pos_ff = PositionwiseFF( d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask) ff_output = self.pos_ff(attn_outputs[0]) outputs = [ff_output] + attn_outputs[1:] return outputs class AdaptiveEmbedding(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): super().__init__() self.n_token = n_token self.d_embed = d_embed self.cutoffs = cutoffs + [n_token] self.div_val = div_val self.d_proj = d_proj self.emb_scale = d_proj ** 0.5 self.cutoff_ends = [0] + self.cutoffs self.emb_layers = nn.ModuleList() self.emb_projs = nn.ParameterList() if div_val == 1: self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0)) if d_proj != d_embed: self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) def forward(self, inp): if self.div_val == 1: embed = self.emb_layers[0](inp) if self.d_proj != self.d_embed: embed = F.linear(embed, self.emb_projs[0]) else: param = next(self.parameters()) inp_flat = inp.view(-1) emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) indices_i = mask_i.nonzero().squeeze() if indices_i.numel() == 0: continue inp_i = inp_flat.index_select(0, indices_i) - l_idx emb_i = self.emb_layers[i](inp_i) emb_i = F.linear(emb_i, self.emb_projs[i]) emb_flat.index_copy_(0, indices_i, emb_i) embed_shape = inp.size() + (self.d_proj,) embed = emb_flat.view(embed_shape) embed.mul_(self.emb_scale) return embed class TransfoXLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = TransfoXLConfig load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" def _init_weight(self, weight): if self.config.init == "uniform": nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) elif self.config.init == "normal": nn.init.normal_(weight, 0.0, self.config.init_std) def _init_bias(self, bias): nn.init.constant_(bias, 0.0) def _init_weights(self, m): """ Initialize the weights. """ classname = m.__class__.__name__ if classname.find("Linear") != -1: if hasattr(m, "weight") and m.weight is not None: self._init_weight(m.weight) if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) elif classname.find("AdaptiveEmbedding") != -1: if hasattr(m, "emb_projs"): for i in range(len(m.emb_projs)): if m.emb_projs[i] is not None: nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) elif classname.find("Embedding") != -1: if hasattr(m, "weight"): self._init_weight(m.weight) elif classname.find("ProjectedAdaptiveLogSoftmax") != -1: if hasattr(m, "cluster_weight") and m.cluster_weight is not None: self._init_weight(m.cluster_weight) if hasattr(m, "cluster_bias") and m.cluster_bias is not None: self._init_bias(m.cluster_bias) if hasattr(m, "out_projs"): for i in range(len(m.out_projs)): if m.out_projs[i] is not None: nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) elif classname.find("LayerNorm") != -1: if hasattr(m, "weight"): nn.init.normal_(m.weight, 1.0, self.config.init_std) if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) else: if hasattr(m, "r_emb"): self._init_weight(m.r_emb) if hasattr(m, "r_w_bias"): self._init_weight(m.r_w_bias) if hasattr(m, "r_r_bias"): self._init_weight(m.r_r_bias) if hasattr(m, "r_bias"): self._init_bias(m.r_bias) TRANSFO_XL_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.TransfoXLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.TransfoXLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", TRANSFO_XL_START_DOCSTRING, ) class TransfoXLModel(TransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model self.n_head = config.n_head self.d_head = config.d_head self.word_emb = AdaptiveEmbedding( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val ) self.drop = nn.Dropout(config.dropout) self.n_layer = config.n_layer self.tgt_len = config.tgt_len self.mem_len = config.mem_len self.ext_len = config.ext_len self.max_klen = config.tgt_len + config.ext_len + config.mem_len self.attn_type = config.attn_type if not config.untie_r: self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.layers = nn.ModuleList() if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( RelPartialLearnableDecoderLayer( config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, r_w_bias=None if config.untie_r else self.r_w_bias, r_r_bias=None if config.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, ) ) else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints raise NotImplementedError # Removed them to avoid maintaining dead code self.same_length = config.same_length self.clamp_len = config.clamp_len if self.attn_type == 0: # default attention self.pos_emb = PositionalEmbedding(self.d_model) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.init_weights() def get_input_embeddings(self): return self.word_emb def set_input_embeddings(self, new_embeddings): self.word_emb = new_embeddings def backward_compatible(self): self.sample_softmax = -1 def reset_length(self, tgt_len, ext_len, mem_len): self.tgt_len = tgt_len self.mem_len = mem_len self.ext_len = ext_len def _prune_heads(self, heads): logger.info("Head pruning is not implemented for Transformer-XL model") pass def init_mems(self, bsz): if self.mem_len > 0: mems = [] param = next(self.parameters()) for i in range(self.n_layer): empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device) mems.append(empty) return mems else: return None def _update_mems(self, hids, mems, mlen, qlen): # does not deal with None if mems is None: return None # mems is not None assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens # will be used as the extended context. Hence, we only cache # the tokens from `mlen + qlen - self.ext_len - self.mem_len` # to `mlen + qlen - self.ext_len`. with torch.no_grad(): new_mems = [] end_idx = mlen + max(0, qlen - 0 - self.ext_len) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): cat = torch.cat([mems[i], hids[i]], dim=0) new_mems.append(cat[beg_idx:end_idx].detach()) return new_mems @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import TransfoXLTokenizer, TransfoXLModel import torch tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLModel.from_pretrained('transfo-xl-wt103') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] """ # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = input_ids.transpose(0, 1).contiguous() qlen, bsz = input_ids.size() elif inputs_embeds is not None: inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if mems is None: mems = self.init_mems(bsz) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) head_mask = head_mask.to( dtype=next(self.parameters()).dtype ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer if inputs_embeds is not None: word_emb = inputs_embeds else: word_emb = self.word_emb(input_ids) mlen = mems[0].size(0) if mems is not None else 0 klen = mlen + qlen if self.same_length: all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) mask_len = klen - self.mem_len if mask_len > 0: mask_shift_len = qlen - mask_len else: mask_shift_len = qlen dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[ :, :, None ] hids = [] attentions = [] if self.attn_type == 0: # default pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) if self.clamp_len > 0: pos_seq.clamp_(max=self.clamp_len) pos_emb = self.pos_emb(pos_seq) core_out = self.drop(word_emb) pos_emb = self.drop(pos_emb) for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] layer_outputs = layer( core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i] ) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out) new_mems = self._update_mems(hids, mems, mlen, qlen) # We transpose back here to shape [bsz, len, hidden_dim] outputs = [core_out.transpose(0, 1).contiguous(), new_mems] if self.output_hidden_states: # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] hids.append(core_out) hids = list(t.transpose(0, 1).contiguous() for t in hids) outputs.append(hids) if self.output_attentions: # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs.append(attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions) @add_start_docstrings( """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", TRANSFO_XL_START_DOCSTRING, ) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = TransfoXLModel(config) self.sample_softmax = config.sample_softmax assert ( self.sample_softmax <= 0 ), "Sampling from the softmax is not implemented yet. Please look at issue: #3310: https://github.com/huggingface/transformers/issues/3310" self.crit = ProjectedAdaptiveLogSoftmax( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val ) self.init_weights() def tie_weights(self): """ Run this to be sure output and input (adaptive) softmax weights are tied """ if self.config.tie_weight: for i in range(len(self.crit.out_layers)): self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i]) if self.config.tie_projs: for i, tie_proj in enumerate(self.config.tie_projs): if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: if self.config.torchscript: self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) else: self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] elif tie_proj and self.config.div_val != 1: if self.config.torchscript: self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) else: self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) def init_mems(self, bsz): return self.transformer.init_mems(bsz) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import TransfoXLTokenizer, TransfoXLLMHeadModel import torch tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] """ if input_ids is not None: bsz, tgt_len = input_ids.size(0), input_ids.size(1) elif inputs_embeds is not None: bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1) else: raise ValueError("You have to specify either input_ids or inputs_embeds") transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] outputs = transformer_outputs[1:] softmax_output = self.crit(pred_hid, labels) if labels is None: softmax_output = softmax_output.view(bsz, tgt_len, -1) outputs = [softmax_output] + outputs else: softmax_output = softmax_output.view(bsz, tgt_len - 1) outputs = [softmax_output, None] + outputs return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) def get_output_embeddings(self): """ Double-check if you are using adaptive softmax. """ if self.sample_softmax > 0: return self.out_layer else: return self.crit.out_layers[-1] def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs): inputs = {"input_ids": input_ids} # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_transfo_xl_utilities.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl. """ import torch import torch.nn as nn import torch.nn.functional as F # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) class ProjectedAdaptiveLogSoftmax(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): super().__init__() self.n_token = n_token self.d_embed = d_embed self.d_proj = d_proj self.cutoffs = cutoffs + [n_token] self.cutoff_ends = [0] + self.cutoffs self.div_val = div_val self.shortlist_size = self.cutoffs[0] self.n_clusters = len(self.cutoffs) - 1 self.head_size = self.shortlist_size + self.n_clusters if self.n_clusters > 0: self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) self.out_layers = nn.ModuleList() self.out_projs = nn.ParameterList() if div_val == 1: for i in range(len(self.cutoffs)): if d_proj != d_embed: self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: self.out_projs.append(None) self.out_layers.append(nn.Linear(d_embed, n_token)) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx)) self.keep_order = keep_order def _compute_logit(self, hidden, weight, bias, proj): if proj is None: logit = F.linear(hidden, weight, bias=bias) else: # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: proj_hid = F.linear(hidden, proj.t().contiguous()) logit = F.linear(proj_hid, weight, bias=bias) # else: # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) # if bias is not None: # logit = logit + bias return logit def forward(self, hidden, labels=None, keep_order=False): """ Params: hidden :: [len*bsz x d_proj] labels :: [len*bsz] Return: if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out :: [(len-1)*bsz] Negative log likelihood We could replace this implementation by the native PyTorch one if their's had an option to set bias on all clusters in the native one. here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 """ if labels is not None: # Shift so that tokens < n predict n hidden = hidden[..., :-1, :].contiguous() labels = labels[..., 1:].contiguous() hidden = hidden.view(-1, hidden.size(-1)) labels = labels.view(-1) if hidden.size(0) != labels.size(0): raise RuntimeError("Input and labels should have the same size " "in the batch dimension.") else: hidden = hidden.view(-1, hidden.size(-1)) if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if labels is not None: out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) else: out = F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] for i in range(len(self.cutoffs)): if self.div_val == 1: l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] weight_i = self.out_layers[0].weight[l_idx:r_idx] bias_i = self.out_layers[0].bias[l_idx:r_idx] else: weight_i = self.out_layers[i].weight bias_i = self.out_layers[i].bias if i == 0: weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) head_logprob = F.log_softmax(head_logit, dim=1) if labels is None: out = hidden.new_empty((head_logit.size(0), self.n_token)) else: out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device) offset = 0 cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] if labels is not None: mask_i = (labels >= l_idx) & (labels < r_idx) indices_i = mask_i.nonzero().squeeze() if indices_i.numel() == 0: continue target_i = labels.index_select(0, indices_i) - l_idx head_logprob_i = head_logprob.index_select(0, indices_i) hidden_i = hidden.index_select(0, indices_i) else: hidden_i = hidden if i == 0: if labels is not None: logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) else: out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster if labels is not None: logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( 1, target_i[:, None] ).squeeze(1) else: logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i out[:, l_idx:r_idx] = logprob_i if labels is not None: if (hasattr(self, "keep_order") and self.keep_order) or keep_order: out.index_copy_(0, indices_i, -logprob_i) else: out[offset : offset + logprob_i.size(0)].copy_(-logprob_i) offset += logprob_i.size(0) return out def log_prob(self, hidden): r""" Computes log probabilities for all :math:`n\_classes` From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py Args: hidden (Tensor): a minibatch of examples Returns: log-probabilities of for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. Shape: - Input: :math:`(N, in\_features)` - Output: :math:`(N, n\_classes)` """ if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) return F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] for i in range(len(self.cutoffs)): if self.div_val == 1: l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] weight_i = self.out_layers[0].weight[l_idx:r_idx] bias_i = self.out_layers[0].bias[l_idx:r_idx] else: weight_i = self.out_layers[i].weight bias_i = self.out_layers[i].bias if i == 0: weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) out = hidden.new_empty((head_logit.size(0), self.n_token)) head_logprob = F.log_softmax(head_logit, dim=1) cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] if i == 0: out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) logprob_i = head_logprob[:, -i] + tail_logprob_i out[:, start_idx, stop_idx] = logprob_i return out ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import inspect import logging import os from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch from torch import Tensor, device, dtype, nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .activations import get_activation from .configuration_utils import PretrainedConfig from .file_utils import ( DUMMY_INPUTS, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url, ) logger = logging.getLogger(__name__) try: from torch.nn import Identity except ImportError: # Older PyTorch compatibility class Identity(nn.Module): r"""A placeholder identity operator that is argument-insensitive. """ def __init__(self, *args, **kwargs): super().__init__() def forward(self, input): return input class ModuleUtilsMixin: """ A few utilities for torch.nn.Modules, to be used as a mixin. """ def num_parameters(self, only_trainable: bool = False) -> int: """ Get number of (optionally, trainable) parameters in the module. """ params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) @staticmethod def _hook_rss_memory_pre_forward(module, *args, **kwargs): try: import psutil except (ImportError): raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") process = psutil.Process(os.getpid()) mem = process.memory_info() module.mem_rss_pre_forward = mem.rss return None @staticmethod def _hook_rss_memory_post_forward(module, *args, **kwargs): try: import psutil except (ImportError): raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") process = psutil.Process(os.getpid()) mem = process.memory_info() module.mem_rss_post_forward = mem.rss mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) return None def add_memory_hooks(self): """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()` """ for module in self.modules(): module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) module.register_forward_hook(self._hook_rss_memory_post_forward) self.reset_memory_hooks_state() def reset_memory_hooks_state(self): for module in self.modules(): module.mem_rss_diff = 0 module.mem_rss_post_forward = 0 module.mem_rss_pre_forward = 0 @property def device(self) -> device: """ Get torch.device from module, assuming that the whole module has one device. """ try: return next(self.parameters()).device except StopIteration: # For nn.DataParallel compatibility in PyTorch 1.5 def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] return tuples gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].device @property def dtype(self) -> dtype: """ Get torch.dtype from module, assuming that the whole module has one dtype. """ try: return next(self.parameters()).dtype except StopIteration: # For nn.DataParallel compatibility in PyTorch 1.5 def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] return tuples gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: """type: torch.Tensor -> torch.Tensor""" if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility if self.dtype == torch.float16: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 elif self.dtype == torch.float32: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: raise ValueError( "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( self.dtype ) ) return encoder_extended_attention_mask def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple, device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. Arguments: attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to input_shape: tuple, shape of input_ids device: torch.Device, usually self.device Returns: torch.Tensor with dtype of attention_mask.dtype """ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] # causal and attention masks must have same type with pytorch version < 1.3 causal_mask = causal_mask.to(attention_mask.dtype) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( input_shape, attention_mask.shape ) ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask: Tensor, num_hidden_layers: int, is_attention_chunked: bool = False) -> Tensor: """ # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head attention_probs has shape bsz x n_heads x N x N Arguments: head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] num_hidden_layers: int Returns: Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] or list with [None] for each layer """ if head_mask is not None: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: head_mask = [None] * num_hidden_layers return head_mask def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility return head_mask class PreTrainedModel(nn.Module, ModuleUtilsMixin): r""" Base class for all models. :class:`~transformers1.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. Class attributes (overridden by derived classes): - ``config_class``: a class derived from :class:`~transformers1.PretrainedConfig` to use as configuration class for this model architecture. - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - ``model``: an instance of the relevant subclass of :class:`~transformers1.PreTrainedModel`, - ``config``: an instance of the relevant subclass of :class:`~transformers1.PretrainedConfig`, - ``path``: a path (string) to the TensorFlow checkpoint. - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. """ config_class = None base_model_prefix = "" @property def dummy_inputs(self): """ Dummy inputs to do a forward pass in the network. Returns: torch.Tensor with dummy inputs """ return {"input_ids": torch.tensor(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super().__init__() if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ ) ) # Save config in model self.config = config @property def base_model(self): return getattr(self, self.base_model_prefix, self) def get_input_embeddings(self): """ Returns the model's input embeddings. Returns: :obj:`nn.Module`: A torch module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: return base_model.get_input_embeddings() else: raise NotImplementedError def set_input_embeddings(self, value: nn.Module): """ Set model's input embeddings Args: value (:obj:`nn.Module`): A module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: base_model.set_input_embeddings(value) else: raise NotImplementedError def get_output_embeddings(self): """ Returns the model's output embeddings. Returns: :obj:`nn.Module`: A torch module mapping hidden states to vocabulary. """ return None # Overwrite for models with output embeddings def tie_weights(self): """ Tie the weights between the input embeddings and the output embeddings. If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the weights instead. """ output_embeddings = self.get_output_embeddings() if output_embeddings is not None: self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) def _tie_or_clone_weights(self, output_embeddings, input_embeddings): """ Tie or clone module weights depending of whether we are using TorchScript or not """ if self.config.torchscript: output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) else: output_embeddings.weight = input_embeddings.weight if getattr(output_embeddings, "bias", None) is not None: output_embeddings.bias.data = torch.nn.functional.pad( output_embeddings.bias.data, (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],), "constant", 0, ) if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings def resize_token_embeddings(self, new_num_tokens: Optional[int] = None): """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Arguments: new_num_tokens: (`optional`) int: New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. Return: ``torch.nn.Embeddings`` Pointer to the input tokens Embeddings Module of the model """ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed model_embeds = base_model._resize_token_embeddings(new_num_tokens) if new_num_tokens is None: return model_embeds # Update base model and current model config self.config.vocab_size = new_num_tokens base_model.vocab_size = new_num_tokens # Tie weights again if needed self.tie_weights() return model_embeds def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.get_input_embeddings() new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) self.set_input_embeddings(new_embeddings) return self.get_input_embeddings() def _get_resized_embeddings( self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None ) -> torch.nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end Args: old_embeddings: ``torch.nn.Embedding`` Old embeddings to be resized. new_num_tokens: (`optional`) int New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end If not provided or None: return the provided token Embedding Module. Return: ``torch.nn.Embedding`` Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None """ if new_num_tokens is None: return old_embeddings old_num_tokens, old_embedding_dim = old_embeddings.weight.size() if old_num_tokens == new_num_tokens: return old_embeddings # Build new embeddings new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) new_embeddings.to(old_embeddings.weight.device) # initialize all new embeddings (in particular added tokens) self._init_weights(new_embeddings) # Copy token embeddings from the previous weights num_tokens_to_copy = min(old_num_tokens, new_num_tokens) new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] return new_embeddings def init_weights(self): """ Initialize and prunes weights if needed. """ # Initialize weights self.apply(self._init_weights) # Prune heads if needed if self.config.pruned_heads: self.prune_heads(self.config.pruned_heads) # Tie weights if needed self.tie_weights() def prune_heads(self, heads_to_prune: Dict): """ Prunes heads of the base model. Arguments: heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. """ # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads for layer, heads in heads_to_prune.items(): union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON self.base_model._prune_heads(heads_to_prune) def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers1.PreTrainedModel.from_pretrained`` class method. Arguments: save_directory: directory to which to save. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # Only save the model itself if we are using distributed training model_to_save = self.module if hasattr(self, "module") else self # Attach architecture to the config model_to_save.config.architectures = [model_to_save.__class__.__name__] # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, WEIGHTS_NAME) if getattr(self.config, "xla_device", False): import torch_xla.core.xla_model as xm if xm.is_master_ordinal(): # Save configuration file model_to_save.config.save_pretrained(save_directory) # xm.save takes care of saving only from master xm.save(model_to_save.state_dict(), output_model_file) else: model_to_save.config.save_pretrained(save_directory) torch.save(model_to_save.state_dict(), output_model_file) logger.info("Model weights saved in {}".format(output_model_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with ``model.train()`` The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) one of: - an instance of a class derived from :class:`~transformers1.PretrainedConfig`, or - a string valid as input to :func:`~transformers1.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: # For example purposes. Not runnable. model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) state_dict = kwargs.pop("state_dict", None) cache_dir = kwargs.pop("cache_dir", None) from_tf = kwargs.pop("from_tf", False) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) local_files_only = kwargs.pop("local_files_only", False) use_cdn = kwargs.pop("use_cdn", True) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, **kwargs, ) else: model_kwargs = kwargs # Load model if pretrained_model_name_or_path is not None: if os.path.isdir(pretrained_model_name_or_path): if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): # Load from a TF 1.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): # Load from a TF 2.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: raise EnvironmentError( "Error no file named {} found in directory {} or `from_tf` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path, ) ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): assert ( from_tf ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME), use_cdn=use_cdn, ) try: # Load from URL or cache if already cached resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) if resolved_archive_file is None: raise EnvironmentError except EnvironmentError: msg = ( f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n" ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None # Instantiate model. model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: try: state_dict = torch.load(resolved_archive_file, map_location="cpu") except Exception: raise OSError( "Unable to load weights from pytorch checkpoint file. " "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " ) missing_keys = [] unexpected_keys = [] error_msgs = [] if from_tf: if resolved_archive_file.endswith(".index"): # Load from a TensorFlow 1.X checkpoint - provided by original authors model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' else: # Load from our TensorFlow 2.0 checkpoints try: from transformers import load_tf2_checkpoint_in_pytorch_model model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise else: # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if "gamma" in key: new_key = key.replace("gamma", "weight") if "beta" in key: new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants # so we need to apply the function recursively. def load(module: nn.Module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs, ) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + ".") # Make sure we are able to load base models as well as derived models (with heads) start_prefix = "" model_to_load = model has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()) if not hasattr(model, cls.base_model_prefix) and has_prefix_module: start_prefix = cls.base_model_prefix + "." if hasattr(model, cls.base_model_prefix) and not has_prefix_module: model_to_load = getattr(model, cls.base_model_prefix) load(model_to_load, prefix=start_prefix) if model.__class__.__name__ != model_to_load.__class__.__name__: base_model_state_dict = model_to_load.state_dict().keys() head_model_state_dict_without_base_prefix = [ key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys() ] missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict) if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys ) ) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys ) ) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading state_dict for {}:\n\t{}".format( model.__class__.__name__, "\n\t".join(error_msgs) ) ) model.tie_weights() # make sure token embedding weights are still tied if needed # Set model in evaluation mode to deactivate DropOut modules by default model.eval() if output_loading_info: loading_info = { "missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs, } return model, loading_info if hasattr(config, "xla_device") and config.xla_device: import torch_xla.core.xla_model as xm model = xm.send_cpu_data_to_device(model, xm.xla_device()) model.to(xm.xla_device()) return model def prepare_inputs_for_generation(self, input_ids, **kwargs): return {"input_ids": input_ids} def prepare_logits_for_generation(self, logits, **kwargs): return logits def _use_cache(self, outputs, use_cache): """During generation, decide whether to pass the `past` variable to the next forward pass.""" if len(outputs) <= 1 or use_cache is False: return False if hasattr(self.config, "mem_len") and self.config.mem_len == 0: return False return True def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ for i in range(batch_size * num_beams): for previous_token in set(prev_output_tokens[i].tolist()): # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability if lprobs[i, previous_token] < 0: lprobs[i, previous_token] *= repetition_penalty else: lprobs[i, previous_token] /= repetition_penalty @torch.no_grad() def generate( self, input_ids: Optional[torch.LongTensor] = None, max_length: Optional[int] = None, min_length: Optional[int] = None, do_sample: Optional[bool] = None, early_stopping: Optional[bool] = None, num_beams: Optional[int] = None, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None, bad_words_ids: Optional[Iterable[int]] = None, bos_token_id: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, length_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, num_return_sequences: Optional[int] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_start_token_id: Optional[int] = None, use_cache: Optional[bool] = None, **model_specific_kwargs ) -> torch.LongTensor: r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. Adapted in part from `Facebook's XLM beam search code`_. .. _`Facebook's XLM beam search code`: https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 Parameters: input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` The sequence used as a prompt for the generation. If `None` the method initializes it as an empty `torch.LongTensor` of shape `(1,)`. max_length: (`optional`) int The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. min_length: (`optional`) int The min length of the sequence to be generated. Between 0 and infinity. Default to 0. do_sample: (`optional`) bool If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. early_stopping: (`optional`) bool if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. num_beams: (`optional`) int Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. temperature: (`optional`) float The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. top_k: (`optional`) int The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. top_p: (`optional`) float The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. repetition_penalty: (`optional`) float The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. pad_token_id: (`optional`) int Padding token. Default to specicic model pad_token_id or None if it does not exist. bos_token_id: (`optional`) int BOS token. Defaults to `bos_token_id` as defined in the models config. eos_token_id: (`optional`) int EOS token. Defaults to `eos_token_id` as defined in the models config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. bad_words_ids: (`optional`) list of lists of int `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. Defaults to `None`. `What are attention masks? <../glossary.html#attention-mask>`__ decoder_start_token_id=None: (`optional`) int If an encoder-decoder model starts decoding with a different token than BOS. Defaults to `None` and is changed to `BOS` later. use_cache: (`optional`) bool If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. model_specific_kwargs: (`optional`) dict Additional model specific kwargs will be forwarded to the `forward` function of the model. Return: output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping use_cache = use_cache if use_cache is not None else self.config.use_cache num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(use_cache, bool), "`use_cache` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." assert temperature > 0, "`temperature` should be strictly positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictly positive." assert ( isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 ), "`no_repeat_ngram_size` should be a positive integer." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = torch.full( (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, ) else: assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): attention_mask = input_ids.ne(pad_token_id).long() elif attention_mask is None: attention_mask = input_ids.new_ones(input_ids.shape) # set pad_token_id to eos_token_id if not set. Important that this is done after # attention_mask is created if pad_token_id is None and eos_token_id is not None: logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size if hasattr(self.config, "vocab_size"): vocab_size = self.config.vocab_size elif ( self.config.is_encoder_decoder and hasattr(self.config, "decoder") and hasattr(self.config.decoder, "vocab_size") ): vocab_size = self.config.decoder.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = input_ids.shape[-1] input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) attention_mask = attention_mask.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, input_ids_len ) input_ids = input_ids.contiguous().view( effective_batch_size * num_beams, input_ids_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = attention_mask.contiguous().view( effective_batch_size * num_beams, input_ids_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = torch.full( (effective_batch_size * num_beams, 1), decoder_start_token_id, dtype=torch.long, device=next(self.parameters()).device, ) cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = ( torch.arange(batch_size) .view(-1, 1) .repeat(1, num_beams * effective_batch_mult) .view(-1) .to(input_ids.device) ) # expand encoder_outputs encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) else: encoder_outputs = None cur_len = input_ids.shape[-1] if num_beams > 1: output = self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, num_return_sequences=num_return_sequences, length_penalty=length_penalty, num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, model_specific_kwargs=model_specific_kwargs, ) else: output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, model_specific_kwargs=model_specific_kwargs, ) return output def _generate_no_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, encoder_outputs, attention_mask, use_cache, model_specific_kwargs, ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ # length of generated sentences / unfinished sentences unfinished_sents = input_ids.new(batch_size).fill_(1) sent_lengths = input_ids.new(batch_size).fill_(max_length) past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) for batch_idx in range(batch_size): next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) for batch_idx in range(batch_size): next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: next_token_logits[:, eos_token_id] = -float("inf") if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # Top-p/top-k filtering next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) # Sample probs = F.softmax(next_token_logits, dim=-1) next_token = torch.multinomial(probs, num_samples=1).squeeze(1) else: # Greedy decoding next_token = torch.argmax(next_token_logits, dim=-1) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) # unfinished_sents is set to zero if eos in sentence unfinished_sents.mul_((~eos_in_sents).long()) # stop when there is a in each sentence, or if we exceed the maximul length if unfinished_sents.max() == 0: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = torch.cat( [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded if sent_lengths.min().item() != sent_lengths.max().item(): assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) else: decoded = input_ids for hypo_idx, hypo in enumerate(input_ids): decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] return decoded def _generate_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, early_stopping, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, num_return_sequences, length_penalty, num_beams, vocab_size, encoder_outputs, attention_mask, use_cache, model_specific_kwargs, ): """ Generate sequences for each example with beam search. """ # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) for _ in range(batch_size) ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times if do_sample is False: beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs ) outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: self.enforce_repetition_penalty_( next_token_logits, batch_size, num_beams, input_ids, repetition_penalty, ) if temperature != 1.0: next_token_logits = next_token_logits / temperature if self.config.is_encoder_decoder and do_sample is False: # TODO (PVP) still a bit hacky here - there might be a better solution next_token_logits = self.prepare_logits_for_generation( next_token_logits, cur_len=cur_len, max_length=max_length ) scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: scores[:, eos_token_id] = -float("inf") if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams num_batch_hypotheses = batch_size * num_beams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_batch_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len ) for i, banned_tokens in enumerate(banned_batch_tokens): scores[i, banned_tokens] = -float("inf") if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) for i, banned_tokens in enumerate(banned_tokens): scores[i, banned_tokens] = -float("inf") assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( scores.shape, (batch_size * num_beams, vocab_size) ) if do_sample: _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # Top-p/top-k filtering _scores = top_k_top_p_filtering( _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together to sample from all beam_idxs _scores = _scores.contiguous().view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) probs = F.softmax(_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) # Compute next scores next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) # sort the sampled vector to make sure that the first num_beams samples are the best next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) else: next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) next_scores = next_scores.view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) # next batch beam content next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): # if we are done with this sentence if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams ), "Batch can only be done if at least {} beams have been generated".format(num_beams) assert ( eos_token_id is not None and pad_token_id is not None ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content next_sent_beam = [] # next tokens for this sentence for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence or last iteration if (eos_token_id is not None) and (token_id.item() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: continue generated_hyps[batch_idx].add( input_ids[effective_beam_id].clone(), beam_token_score.item(), ) else: # add next predicted token if it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: break # Check if were done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( next_scores[batch_idx].max().item(), cur_len=cur_len ) # update next beam content assert len(next_sent_beam) == num_beams, "Beam should always be full" next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_idx + 1) # stop when we are done with each sentence if all(done): break # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) beam_idx = input_ids.new([x[2] for x in next_batch_beam]) # re-order batch and update current length input_ids = input_ids[beam_idx, :] input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) cur_len = cur_len + 1 # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = torch.cat( [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 ) # finalize all open beam hypotheses and end to generated hypotheses for batch_idx in range(batch_size): if done[batch_idx]: continue # test that beam scores match previously calculated scores if not eos and batch_idx not done if eos_token_id is not None and all( (token_id % vocab_size).item() is not eos_token_id for token_id in next_tokens[batch_idx] ): assert torch.all( next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], ) # need to add best num_beams hypotheses to generated hyps for beam_id in range(num_beams): effective_beam_id = batch_idx * num_beams + beam_id final_score = beam_scores[effective_beam_id].item() final_tokens = input_ids[effective_beam_id] generated_hyps[batch_idx].add(final_tokens, final_score) # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch output_batch_size = batch_size if do_sample else batch_size * num_return_sequences output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences # select the best hypotheses sent_lengths = input_ids.new(output_batch_size) best = [] # retrieve best hypotheses for i, hypotheses in enumerate(generated_hyps): sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) for j in range(output_num_return_sequences_per_batch): effective_batch_idx = output_num_return_sequences_per_batch * i + j best_hyp = sorted_hyps.pop()[1] sent_lengths[effective_batch_idx] = len(best_hyp) best.append(best_hyp) # shorter batches are filled with pad_token if sent_lengths.min().item() != sent_lengths.max().item(): assert pad_token_id is not None, "`Pad_token_id` has to be defined" sent_max_len = min(sent_lengths.max().item() + 1, max_length) decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) # fill with hypothesis and eos_token_id if necessary for i, hypo in enumerate(best): decoded[i, : sent_lengths[i]] = hypo if sent_lengths[i] < max_length: decoded[i, sent_lengths[i]] = eos_token_id else: # none of the hypotheses have an eos_token assert (len(hypo) == max_length for hypo in best) decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) return decoded @staticmethod def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: """Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]: banned_tokens = [] def _tokens_match(prev_tokens, tokens): if len(tokens) == 0: # if bad word tokens is just one token always ban it return True if len(tokens) > len(prev_input_ids): # if bad word tokens are longer then prev input_ids they can't be equal return False if prev_tokens[-len(tokens) :] == tokens: # if tokens match return True else: return False for prev_input_ids_slice in prev_input_ids: banned_tokens_slice = [] for banned_token_seq in bad_words_ids: assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( bad_words_ids ) if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: # if tokens do not match continue continue banned_tokens_slice.append(banned_token_seq[-1]) banned_tokens.append(banned_tokens_slice) return banned_tokens def top_k_top_p_filtering( logits: Tensor, top_k: int = 0, top_p: float = 1.0, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1, ) -> Tensor: """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ if top_k > 0: top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) logits[indices_to_remove] = filter_value return logits class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. """ self.max_length = max_length - 1 # ignoring bos_token self.length_penalty = length_penalty self.early_stopping = early_stopping self.num_beams = num_beams self.beams = [] self.worst_score = 1e9 def __len__(self): """ Number of hypotheses in the list. """ return len(self.beams) def add(self, hyp, sum_logprobs): """ Add a new hypothesis to the list. """ score = sum_logprobs / len(hyp) ** self.length_penalty if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_scores[0][1]] self.worst_score = sorted_scores[1][0] else: self.worst_score = min(score, self.worst_score) def is_done(self, best_sum_logprobs, cur_len=None): """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. """ if len(self) < self.num_beams: return False elif self.early_stopping: return True else: if cur_len is None: cur_len = self.max_length cur_score = best_sum_logprobs / cur_len ** self.length_penalty ret = self.worst_score >= cur_score return ret class Conv1D(nn.Module): def __init__(self, nf, nx): """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ super().__init__() self.nf = nf w = torch.empty(nx, nf) nn.init.normal_(w, std=0.02) self.weight = nn.Parameter(w) self.bias = nn.Parameter(torch.zeros(nf)) def forward(self, x): size_out = x.size()[:-1] + (self.nf,) x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) x = x.view(*size_out) return x class PoolerStartLogits(nn.Module): """ Compute SQuAD start_logits from sequence hidden states. """ def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, 1) def forward(self, hidden_states, p_mask=None): """ Args: **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` invalid position mask such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. """ x = self.dense(hidden_states).squeeze(-1) if p_mask is not None: if next(self.parameters()).dtype == torch.float16: x = x * (1 - p_mask) - 65500 * p_mask else: x = x * (1 - p_mask) - 1e30 * p_mask return x class PoolerEndLogits(nn.Module): """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. """ def __init__(self, config): super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dense_1 = nn.Linear(config.hidden_size, 1) def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): """ Args: One of ``start_states``, ``start_positions`` should be not None. If both are set, ``start_positions`` overrides ``start_states``. **start_states**: ``torch.LongTensor`` of shape identical to hidden_states hidden states of the first tokens for the labeled span. **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span: **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. """ assert ( start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) x = self.activation(x) x = self.LayerNorm(x) x = self.dense_1(x).squeeze(-1) if p_mask is not None: if next(self.parameters()).dtype == torch.float16: x = x * (1 - p_mask) - 65500 * p_mask else: x = x * (1 - p_mask) - 1e30 * p_mask return x class PoolerAnswerClass(nn.Module): """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ def __init__(self, config): super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): """ Args: One of ``start_states``, ``start_positions`` should be not None. If both are set, ``start_positions`` overrides ``start_states``. **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. hidden states of the first tokens for the labeled span. **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span. **cls_index**: torch.LongTensor of shape ``(batch_size,)`` position of the CLS token. If None, take the last token. note(Original repo): no dependency on end_feature so that we can obtain one single `cls_logits` for each sample """ hsz = hidden_states.shape[-1] assert ( start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) if cls_index is not None: cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) else: cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) x = self.activation(x) x = self.dense_1(x).squeeze(-1) return x class SQuADHead(nn.Module): r""" A SQuAD head inspired by XLNet. Parameters: config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. Inputs: **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` hidden states of sequence tokens **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span. **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the last token for the labeled span. **cls_index**: torch.LongTensor of shape ``(batch_size,)`` position of the CLS token. If None, take the last token. **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` Whether the question has a possible answer in the paragraph or not. **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` Log probabilities for the top config.start_n_top start token possibilities (beam-search). **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` Indices for the top config.start_n_top start token possibilities (beam-search). **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size,)`` Log probabilities for the ``is_impossible`` label of the answers. """ def __init__(self, config): super().__init__() self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top self.start_logits = PoolerStartLogits(config) self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) def forward( self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, ): outputs = () start_logits = self.start_logits(hidden_states, p_mask=p_mask) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions, cls_index, is_impossible): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if cls_index is not None and is_impossible is not None: # Predict answerability from the representation of CLS and START cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) loss_fct_cls = nn.BCEWithLogitsLoss() cls_loss = loss_fct_cls(cls_logits, is_impossible) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 outputs = (total_loss,) + outputs else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 ) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # or (if labels are provided) (total_loss,) return outputs class SequenceSummary(nn.Module): r""" Compute a single vector summary of a sequence hidden states according to various possibilities: Args of the config class: summary_type: - 'last' => [default] take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj: Add a projection after the vector extraction summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ def __init__(self, config: PretrainedConfig): super().__init__() self.summary_type = getattr(config, "summary_type", "last") if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.summary = Identity() if hasattr(config, "summary_use_proj") and config.summary_use_proj: if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) activation_string = getattr(config, "summary_activation", None) self.activation: Callable = (get_activation(activation_string) if activation_string else Identity()) self.first_dropout = Identity() if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: self.first_dropout = nn.Dropout(config.summary_first_dropout) self.last_dropout = Identity() if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward(self, hidden_states, cls_index=None): """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer. cls_index: [optional] position of the classification token if summary_type == 'cls_index', shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ if self.summary_type == "last": output = hidden_states[:, -1] elif self.summary_type == "first": output = hidden_states[:, 0] elif self.summary_type == "mean": output = hidden_states.mean(dim=1) elif self.summary_type == "cls_index": if cls_index is None: cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,) else: cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) elif self.summary_type == "attn": raise NotImplementedError output = self.first_dropout(output) output = self.summary(output) output = self.activation(output) output = self.last_dropout(output) return output def create_position_ids_from_input_ids(input_ids, padding_idx): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param torch.Tensor x: :return torch.Tensor: """ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. mask = input_ids.ne(padding_idx).int() incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask return incremental_indices.long() + padding_idx def prune_linear_layer(layer, index, dim=0): """ Prune a linear layer (a model parameters) to keep only entries in index. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ index = index.to(layer.weight.device) W = layer.weight.index_select(dim, index).clone().detach() if layer.bias is not None: if dim == 1: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True if layer.bias is not None: new_layer.bias.requires_grad = False new_layer.bias.copy_(b.contiguous()) new_layer.bias.requires_grad = True return new_layer def prune_conv1d_layer(layer, index, dim=1): """ Prune a Conv1D layer (a model parameters) to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ index = index.to(layer.weight.device) W = layer.weight.index_select(dim, index).clone().detach() if dim == 0: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True new_layer.bias.requires_grad = False new_layer.bias.copy_(b.contiguous()) new_layer.bias.requires_grad = True return new_layer def prune_layer(layer, index, dim=None): """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ if isinstance(layer, nn.Linear): return prune_linear_layer(layer, index, dim=0 if dim is None else dim) elif isinstance(layer, Conv1D): return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) else: raise ValueError("Can't prune layer of class {}".format(layer.__class__)) def apply_chunking_to_forward( chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors ) -> torch.Tensor: """ This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as not applying it. Args: chunk_size: int - the chunk size of a chunked tensor. `num_chunks` = `len(input_tensors[0]) / chunk_size` chunk_dim: int - the dimension over which the input_tensors should be chunked forward_fn: fn - the forward fn of the model input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked Returns: a Tensor with the same shape the foward_fn would have given if applied Examples:: # rename the usual forward() fn to forward_chunk() def forward_chunk(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states # implement a chunked forward function def forward(self, hidden_states): return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states) """ assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors) tensor_shape = input_tensors[0].shape assert all( input_tensor.shape == tensor_shape for input_tensor in input_tensors ), "All input tenors have to be of the same shape" # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compability num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) assert num_args_in_forward_chunk_fn == len( input_tensors ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format( num_args_in_forward_chunk_fn, len(input_tensors) ) if chunk_size > 0: assert ( input_tensors[0].shape[chunk_dim] % chunk_size == 0 ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format( input_tensors[0][chunk_dim], chunk_size ) num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size # chunk input tensor into tuples input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) # apply forward fn to every tuple output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) # concatenate output at same dimension return torch.cat(output_chunks, dim=chunk_dim) return forward_fn(*input_tensors) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch XLM model. """ import itertools import logging import math import numpy as np import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .activations import gelu from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer logger = logging.getLogger(__name__) XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-mlm-en-2048", "xlm-mlm-ende-1024", "xlm-mlm-enfr-1024", "xlm-mlm-enro-1024", "xlm-mlm-tlm-xnli15-1024", "xlm-mlm-xnli15-1024", "xlm-clm-enfr-1024", "xlm-clm-ende-1024", "xlm-mlm-17-1280", "xlm-mlm-100-1280", # See all XLM models at https://huggingface.co/models?filter=xlm ] def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False def get_masks(slen, lengths, causal, padding_mask=None): """ Generate hidden states mask, and optionally an attention mask. """ alen = torch.arange(slen, dtype=torch.long, device=lengths.device) if padding_mask is not None: mask = padding_mask else: assert lengths.max().item() <= slen mask = alen < lengths[:, None] # attention mask is the same as mask, or triangular inferior attention (causal) bs = lengths.size(0) if causal: attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] else: attn_mask = mask # sanity check assert mask.size() == (bs, slen) assert causal is False or attn_mask.size() == (bs, slen, slen) return mask, attn_mask class MultiHeadAttention(nn.Module): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): super().__init__() self.layer_id = next(MultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim self.n_heads = n_heads self.dropout = config.attention_dropout assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(dim, dim) self.k_lin = nn.Linear(dim, dim) self.v_lin = nn.Linear(dim, dim) self.out_lin = nn.Linear(dim, dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input, mask, kv=None, cache=None, head_mask=None): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) n_heads = self.n_heads dim_per_head = self.dim // n_heads mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) def shape(x): """ projection """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, qlen, klen) weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: outputs = outputs + (weights,) return outputs class TransformerFFN(nn.Module): def __init__(self, in_dim, dim_hidden, out_dim, config): super().__init__() self.dropout = config.dropout self.lin1 = nn.Linear(in_dim, dim_hidden) self.lin2 = nn.Linear(dim_hidden, out_dim) self.act = gelu if config.gelu_activation else F.relu def forward(self, input): x = self.lin1(input) x = self.act(x) x = self.lin2(x) x = F.dropout(x, p=self.dropout, training=self.training) return x class XLMPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLMConfig load_tf_weights = None base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) @property def dummy_inputs(self): inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) if self.config.use_lang_emb and self.config.n_langs > 1: langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, nn.Embedding): if self.config is not None and self.config.embed_init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) if isinstance(module, nn.Linear): if self.config is not None and self.config.init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.init_std) if hasattr(module, "bias") and module.bias is not None: nn.init.constant_(module.bias, 0.0) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) XLM_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLMConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`): dictionary with ``torch.FloatTensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", XLM_START_DOCSTRING, ) class XLMModel(XLMPreTrainedModel): def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states # encoder / decoder, output layer self.is_encoder = config.is_encoder self.is_decoder = not config.is_encoder if self.is_decoder: raise NotImplementedError("Currently XLM can only be used as an encoder") # self.with_output = with_output self.causal = config.causal # dictionary / languages self.n_langs = config.n_langs self.use_lang_emb = config.use_lang_emb self.n_words = config.n_words self.eos_index = config.eos_index self.pad_index = config.pad_index # self.dico = dico # self.id2lang = config.id2lang # self.lang2id = config.lang2id # assert len(self.dico) == self.n_words # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers self.dropout = config.dropout self.attention_dropout = config.attention_dropout assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) if config.sinusoidal_embeddings: create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) # transformer layers self.attentions = nn.ModuleList() self.layer_norm1 = nn.ModuleList() self.ffns = nn.ModuleList() self.layer_norm2 = nn.ModuleList() # if self.is_decoder: # self.layer_norm15 = nn.ModuleList() # self.encoder_attn = nn.ModuleList() for _ in range(self.n_layers): self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config)) self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() config.pruned_heads = {} for layer, heads in pruned_heads: if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) self.init_weights() def get_input_embeddings(self): return self.embeddings def set_input_embeddings(self, new_embeddings): self.embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.attentions[layer].prune_heads(heads) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMModel import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMModel.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None: bs, slen = input_ids.size() else: bs, slen = inputs_embeds.size()[:-1] if lengths is None: if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs assert lengths.size(0) == bs assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] device = input_ids.device if input_ids is not None else inputs_embeds.device # position_ids if position_ids is None: position_ids = torch.arange(slen, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand((bs, slen)) else: assert position_ids.size() == (bs, slen) # (slen, bs) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: assert langs.size() == (bs, slen) # (slen, bs) # langs = langs.transpose(0, 1) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layers) # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) if langs is not None and self.use_lang_emb and self.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = F.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) class XLMPredLayer(nn.Module): """ Prediction layer (cross_entropy or adaptive_softmax). """ def __init__(self, config): super().__init__() self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index dim = config.emb_dim if config.asm is False: self.proj = nn.Linear(dim, config.n_words, bias=True) else: self.proj = nn.AdaptiveLogSoftmaxWithLoss( in_features=dim, n_classes=config.n_words, cutoffs=config.asm_cutoffs, div_value=config.asm_div_value, head_bias=True, # default is False ) def forward(self, x, y=None): """ Compute the loss, and optionally the scores. """ outputs = () if self.asm is False: scores = self.proj(x) outputs = (scores,) + outputs if y is not None: loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean") outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) outputs = (scores,) + outputs if y is not None: _, loss = self.proj(x, y) outputs = (loss,) + outputs return outputs @add_start_docstrings( """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLM_START_DOCSTRING, ) class XLMWithLMHeadModel(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) self.init_weights() def get_output_embeddings(self): return self.pred_layer.proj def prepare_inputs_for_generation(self, input_ids, **kwargs): mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id effective_batch_size = input_ids.shape[0] mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device) input_ids = torch.cat([input_ids, mask_token], dim=1) if lang_id is not None: langs = torch.full_like(input_ids, lang_id) else: langs = None return {"input_ids": input_ids, "langs": langs} @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMWithLMHeadModel import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_START_DOCSTRING, ) class XLMForSequenceClassification(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLMModel(config) self.sequence_summary = SequenceSummary(config) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForSequenceClassification import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] logits = self.sequence_summary(output) outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs @add_start_docstrings( """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForQuestionAnsweringSimple import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = ( start_logits, end_logits, ) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class XLMForQuestionAnswering(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = SQuADHead(config) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels whether a question has an answer or no answer (SQuAD 2.0) cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for position (index) of the classification token to use as input for computing plausibility of the answer. p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be masked. 0.0 mean token is not masked. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top config.start_n_top start token possibilities (beam-search). start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top config.start_n_top start token possibilities (beam-search). end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the ``is_impossible`` label of the answers. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForQuestionAnswering import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] outputs = self.qa_outputs( output, start_positions=start_positions, end_positions=end_positions, cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask, ) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_START_DOCSTRING, ) class XLMForTokenClassification(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLMModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForTokenClassification import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280') model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch XLM-RoBERTa model. """ import logging from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-roberta-base", "xlm-roberta-large", "xlm-roberta-large-finetuned-conll02-dutch", "xlm-roberta-large-finetuned-conll02-spanish", "xlm-roberta-large-finetuned-conll03-english", "xlm-roberta-large-finetuned-conll03-german", # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta ] XLM_ROBERTA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers1.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): """ This class overrides :class:`~transformers1.RobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): """ This class overrides :class:`~transformers1.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForTokenClassification(RobertaForTokenClassification): """ This class overrides :class:`~transformers1.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig ================================================ FILE: code/bert-base-count3/pretrain/transformers1/modeling_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch XLNet model. """ import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .activations import gelu_new, swish from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary logger = logging.getLogger(__name__) XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlnet-base-cased", "xlnet-large-cased", # See all XLNet models at https://huggingface.co/models?filter=xlnet ] def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): """ A map of modules from TF to PyTorch. I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} if hasattr(model, "transformer"): if hasattr(model, "lm_loss"): # We will load also the output bias tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights: # We will load also the sequence summary tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias if ( hasattr(model, "logits_proj") and config.finetuning_task is not None and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights ): tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias # Now load the rest of the transformer model = model.transformer # Embeddings and output tf_to_pt_map.update( { "model/transformer/word_embedding/lookup_table": model.word_embedding.weight, "model/transformer/mask_emb/mask_emb": model.mask_emb, } ) # Transformer blocks for i, b in enumerate(model.layer): layer_str = "model/transformer/layer_%d/" % i tf_to_pt_map.update( { layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, layer_str + "rel_attn/o/kernel": b.rel_attn.o, layer_str + "rel_attn/q/kernel": b.rel_attn.q, layer_str + "rel_attn/k/kernel": b.rel_attn.k, layer_str + "rel_attn/r/kernel": b.rel_attn.r, layer_str + "rel_attn/v/kernel": b.rel_attn.v, layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, } ) # Relative positioning biases if config.untie_r: r_r_list = [] r_w_list = [] r_s_list = [] seg_embed_list = [] for b in model.layer: r_r_list.append(b.rel_attn.r_r_bias) r_w_list.append(b.rel_attn.r_w_bias) r_s_list.append(b.rel_attn.r_s_bias) seg_embed_list.append(b.rel_attn.seg_embed) else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] r_s_list = [model.r_s_bias] seg_embed_list = [model.seg_embed] tf_to_pt_map.update( { "model/transformer/r_r_bias": r_r_list, "model/transformer/r_w_bias": r_w_list, "model/transformer/r_s_bias": r_s_list, "model/transformer/seg_embed": seg_embed_list, } ) return tf_to_pt_map def load_tf_weights_in_xlnet(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ try: import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise # Load weights from TF model init_vars = tf.train.list_variables(tf_path) tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) tf_weights[name] = array # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights) for name, pointer in tf_to_pt_map.items(): logger.info("Importing {}".format(name)) if name not in tf_weights: logger.info("{} not in tf pre-trained weights, skipping".format(name)) continue array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name): logger.info("Transposing") array = np.transpose(array) if isinstance(pointer, list): # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] try: assert p_i.shape == arr_i.shape except AssertionError as e: e.args += (p_i.shape, arr_i.shape) raise logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) p_i.data = torch.from_numpy(arr_i) else: try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) tf_weights.pop(name + "/Adam", None) tf_weights.pop(name + "/Adam_1", None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model ACT2FN = {"gelu": gelu_new, "relu": torch.nn.functional.relu, "swish": swish} XLNetLayerNorm = nn.LayerNorm class XLNetRelativeAttention(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.d_model, config.n_head) ) self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model self.scale = 1 / (config.d_head ** 0.5) self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head)) self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.dropout) def prune_heads(self, heads): raise NotImplementedError @staticmethod def rel_shift(x, klen=-1): """perform relative shift to form the relative attention score.""" x_size = x.shape x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3]) x = x[1:, ...] x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3]) # x = x[:, 0:klen, :, :] x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) return x @staticmethod def rel_shift_bnij(x, klen=-1): x_size = x.shape x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2]) x = x[:, :, 1:, :] x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1) # Note: the tensor-slice form was faster in my testing than torch.index_select # However, tracing doesn't like the nature of the slice, and if klen changes # during the run then it'll fail, whereas index_select will be fine. x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long)) # x = x[:, :, :, :klen] return x def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None): """Core relative positional attention operations.""" # content based attention score ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h) # position based attention score bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) # segment based attention score if seg_mat is None: ef = 0 else: ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == torch.float16: attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask) else: attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask) # attention probability attn_prob = F.softmax(attn_score, dim=3) attn_prob = self.dropout(attn_prob) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask) # attention output attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, torch.einsum("bnij->ijbn", attn_prob) return attn_vec def post_attention(self, h, attn_vec, residual=True): """Post-attention processing.""" # post-attention projection (back to `d_model`) attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out) if residual: attn_out = attn_out + h output = self.layer_norm(attn_out) return output def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: cat = h # content-based key head k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # h-stream # content-stream query head q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h # post processing output_h = self.post_attention(h, attn_vec_h) # g-stream # query-stream query head q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g # post processing output_g = self.post_attention(g, attn_vec_g) if self.output_attentions: attn_prob = attn_prob_h, attn_prob_g else: # Multi-head attention with relative positional encoding if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: cat = h # content heads q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask ) if self.output_attentions: attn_vec, attn_prob = attn_vec # post processing output_h = self.post_attention(h, attn_vec) output_g = None outputs = (output_h, output_g) if self.output_attentions: outputs = outputs + (attn_prob,) return outputs class XLNetFeedForward(nn.Module): def __init__(self, config): super().__init__() self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) self.layer_1 = nn.Linear(config.d_model, config.d_inner) self.layer_2 = nn.Linear(config.d_inner, config.d_model) self.dropout = nn.Dropout(config.dropout) if isinstance(config.ff_activation, str): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation def forward(self, inp): output = inp output = self.layer_1(output) output = self.activation_function(output) output = self.dropout(output) output = self.layer_2(output) output = self.dropout(output) output = self.layer_norm(output + inp) return output class XLNetLayer(nn.Module): def __init__(self, config): super().__init__() self.rel_attn = XLNetRelativeAttention(config) self.ff = XLNetFeedForward(config) self.dropout = nn.Dropout(config.dropout) def forward( self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None ): outputs = self.rel_attn( output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=mems, target_mapping=target_mapping, head_mask=head_mask, ) output_h, output_g = outputs[:2] if output_g is not None: output_g = self.ff(output_g) output_h = self.ff(output_h) outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there return outputs class XLNetPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLNetConfig load_tf_weights = load_tf_weights_in_xlnet base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, XLNetLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, XLNetRelativeAttention): for param in [ module.q, module.k, module.v, module.o, module.r, module.r_r_bias, module.r_s_bias, module.r_w_bias, module.seg_embed, ]: param.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, XLNetModel): module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) XLNET_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLNetConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLNET_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. `use_cache` has to be set to `True` to make use of `mems`. perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``: If ``perm_mask[k, i, j] = 0``, i attend to j in batch k; if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. If None, each token attends to all the others (full bidirectional attention). Only used during pretraining (to define factorization order) or for sequential decoding (generation). target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding (generation). token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token. The classifier token should be represented by a ``2``. `What are token type IDs? <../glossary.html#token-type-ids>`_ input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. Kept for compatibility with the original code base. You can only uses one of `input_mask` and `attention_mask` Mask values selected in ``[0, 1]``: ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. """ @add_start_docstrings( "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", XLNET_START_DOCSTRING, ) class XLNetModel(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.mem_len = config.mem_len self.reuse_len = config.reuse_len self.d_model = config.d_model self.same_length = config.same_length self.attn_type = config.attn_type self.bi_data = config.bi_data self.clamp_len = config.clamp_len self.n_layer = config.n_layer self.word_embedding = nn.Embedding(config.vocab_size, config.d_model) self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model)) self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.dropout = nn.Dropout(config.dropout) self.init_weights() def get_input_embeddings(self): return self.word_embedding def set_input_embeddings(self, new_embeddings): self.word_embedding = new_embeddings def _prune_heads(self, heads_to_prune): raise NotImplementedError def create_mask(self, qlen, mlen): """ Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. Args: qlen: Sequence length mlen: Mask length :: same_length=False: same_length=True: < qlen > < qlen > ^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1] qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1] [0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1] v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0] """ attn_mask = torch.ones([qlen, qlen]) mask_up = torch.triu(attn_mask, diagonal=1) attn_mask_pad = torch.zeros([qlen, mlen]) ret = torch.cat([attn_mask_pad, mask_up], dim=1) if self.same_length: mask_lo = torch.tril(attn_mask, diagonal=-1) ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1) ret = ret.to(self.device) return ret def cache_mem(self, curr_out, prev_mem): # cache hidden states into memory. if self.reuse_len is not None and self.reuse_len > 0: curr_out = curr_out[: self.reuse_len] if prev_mem is None: new_mem = curr_out[-self.mem_len :] else: new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :] return new_mem.detach() @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq) pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) pos_emb = pos_emb[:, None, :] if bsz is not None: pos_emb = pos_emb.expand(-1, bsz, -1) return pos_emb def relative_positional_encoding(self, qlen, klen, bsz=None): # create relative positional encoding. freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float) inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model)) if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float) bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float) if self.clamp_len > 0: fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) if bsz is not None: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1) else: fwd_pos_seq = torch.arange(beg, end, -1.0) if self.clamp_len > 0: fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) pos_emb = pos_emb.to(self.device) return pos_emb @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`): Sequence of hidden-states at the last layer of the model. `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetModel import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetModel.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = input_ids.transpose(0, 1).contiguous() qlen, bsz = input_ids.shape[0], input_ids.shape[1] elif inputs_embeds is not None: inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen dtype_float = self.dtype device = self.device # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] elif self.attn_type == "bi": attn_mask = None else: raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." if input_mask is None and attention_mask is not None: input_mask = 1.0 - attention_mask if input_mask is not None and perm_mask is not None: data_mask = input_mask[None] + perm_mask elif input_mask is not None and perm_mask is None: data_mask = input_mask[None] elif input_mask is None and perm_mask is not None: data_mask = perm_mask else: data_mask = None if data_mask is not None: # all mems can be attended to if mlen > 0: mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask) data_mask = torch.cat([mems_mask, data_mask], dim=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] else: attn_mask += data_mask[:, :, :, None] if attn_mask is not None: attn_mask = (attn_mask > 0).to(dtype_float) if attn_mask is not None: non_tgt_mask = -torch.eye(qlen).to(attn_mask) if mlen > 0: non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1) non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask) else: non_tgt_mask = None # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k) if target_mapping is not None: word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q) else: output_g = None # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device) cat_ids = torch.cat([mem_pad, token_type_ids], dim=0) else: cat_ids = token_type_ids # `1` indicates not in the same segment [qlen x klen x bsz] seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long() seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float) else: seg_mat = None # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) pos_emb = self.dropout(pos_emb) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) head_mask = head_mask.to( dtype=next(self.parameters()).dtype ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer new_mems = () if mems is None: mems = [None] * len(self.layer) attentions = [] hidden_states = [] for i, layer_module in enumerate(self.layer): if self.mem_len is not None and self.mem_len > 0 and use_cache is True: # cache new mems new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) outputs = layer_module( output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask, r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping, head_mask=head_mask[i], ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) # Add last hidden state if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) output = self.dropout(output_g if output_g is not None else output_h) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) outputs = (output.permute(1, 0, 2).contiguous(),) if self.mem_len is not None and self.mem_len > 0 and use_cache is True: outputs = outputs + (new_mems,) if self.output_hidden_states: if output_g is not None: hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs) else: hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states) outputs = outputs + (hidden_states,) if self.output_attentions: if target_mapping is not None: # when target_mapping is provided, there are 2-tuple of attentions attentions = tuple( tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions ) else: attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs = outputs + (attentions,) return outputs # outputs, (new_mems), (hidden_states), (attentions) @add_start_docstrings( """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) class XLNetLMHeadModel(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.attn_type = config.attn_type self.same_length = config.same_length self.transformer = XLNetModel(config) self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True) self.init_weights() def get_output_embeddings(self): return self.lm_loss def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # Add dummy token at the end (no attention on this one) effective_batch_size = input_ids.shape[0] dummy_token = torch.zeros((effective_batch_size, 1), dtype=torch.long, device=input_ids.device) input_ids = torch.cat([input_ids, dummy_token], dim=1) # Build permutation mask so that previous tokens don't see last token sequence_length = input_ids.shape[1] perm_mask = torch.zeros( (effective_batch_size, sequence_length, sequence_length), dtype=torch.float, device=input_ids.device ) perm_mask[:, :, -1] = 1.0 # We'll only predict the last token target_mapping = torch.zeros( (effective_batch_size, 1, sequence_length), dtype=torch.float, device=input_ids.device ) target_mapping[0, 0, -1] = 1.0 inputs = { "input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping, "use_cache": kwargs["use_cache"], } # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): Labels for masked language modeling. `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. The labels should correspond to the masked input words that should be predicted and depends on `target_mapping`. Note in order to perform standard auto-regressive language modeling a `` token has to be added to the `input_ids` (see `prepare_inputs_for_generation` fn and examples below) Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored, the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetLMHeadModel import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling. input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0) assert labels.shape[0] == 1, 'only one word will be predicted' perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels) loss, next_token_logits = outputs[:2] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) logits = self.lm_loss(transformer_outputs[0]) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLNET_START_DOCSTRING, ) class XLNetForSequenceClassification(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`) Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForSequenceClassification import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNET_START_DOCSTRING, ) class XLNetForTokenClassification(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`): Classification scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForTokenClassification import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) scores = outputs[0] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, XLNET_START_DOCSTRING, ) class XLNetForMultipleChoice(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLNetModel(config) self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, 1) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForMultipleChoice import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None transformer_outputs = self.transformer( flat_input_ids, token_type_ids=flat_token_type_ids, input_mask=flat_input_mask, attention_mask=flat_attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForQuestionAnsweringSimple import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class XLNetForQuestionAnswering(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top self.transformer = XLNetModel(config) self.start_logits = PoolerStartLogits(config) self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels whether a question has an answer or no answer (SQuAD 2.0) cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for position (index) of the classification token to use as input for computing plausibility of the answer. p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be masked. 0.0 mean token is not masked. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top config.start_n_top start token possibilities (beam-search). start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top config.start_n_top start token possibilities (beam-search). end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the ``is_impossible`` label of the answers. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForQuestionAnswering import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions, cls_index, is_impossible): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if cls_index is not None and is_impossible is not None: # Predict answerability from the representation of CLS and START cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) loss_fct_cls = nn.BCEWithLogitsLoss() cls_loss = loss_fct_cls(cls_logits, is_impossible) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 outputs = (total_loss,) + outputs else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 ) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) start_states = torch.einsum( "blh,bl->bh", hidden_states, start_log_probs ) # get the representation of START as weighted sum of hidden states cls_logits = self.answer_class( hidden_states, start_states=start_states, cls_index=cls_index ) # Shape (batch size,): one single `cls_logits` for each sample outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # or (if labels are provided) (total_loss,) return outputs ================================================ FILE: code/bert-base-count3/pretrain/transformers1/optimization.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch optimization for BERT model.""" import logging import math import torch from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR logger = logging.getLogger(__name__) def get_constant_schedule(optimizer, last_epoch=-1): """ Create a schedule with a constant learning rate. """ return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1.0, num_warmup_steps)) return 1.0 return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): """ Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) return max( 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) ) return LambdaLR(optimizer, lr_lambda, last_epoch) def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): """ Create a schedule with a learning rate that decreases following the values of the cosine function between 0 and `pi * cycles` after a warmup period during which it increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaLR(optimizer, lr_lambda, last_epoch) def get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 ): """ Create a schedule with a learning rate that decreases following the values of the cosine function with several hard restarts, after a warmup period during which it increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) if progress >= 1.0: return 0.0 return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) return LambdaLR(optimizer, lr_lambda, last_epoch) class AdamW(Optimizer): """ Implements Adam algorithm with weight decay fix. Parameters: lr (float): learning rate. Default 1e-3. betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) eps (float): Adams epsilon. Default: 1e-6 weight_decay (float): Weight decay. Default: 0.0 correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): if lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) super().__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") state = self.state[p] # State initialization if len(state) == 0: state["step"] = 0 # Exponential moving average of gradient values state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state["exp_avg_sq"] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] beta1, beta2 = group["betas"] state["step"] += 1 # Decay the first and second moment running average coefficient # In-place operations to update the averages at the same time exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) denom = exp_avg_sq.sqrt().add_(group["eps"]) step_size = group["lr"] if group["correct_bias"]: # No bias correction for Bert bias_correction1 = 1.0 - beta1 ** state["step"] bias_correction2 = 1.0 - beta2 ** state["step"] step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(exp_avg, denom, value=-step_size) # Just adding the square of the weights to the loss function is *not* # the correct way of using L2 regularization/weight decay with Adam, # since that will interact with the m and v parameters in strange ways. # # Instead we want to decay the weights in a manner that doesn't interact # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. # Add weight decay at the end (fixed version) if group["weight_decay"] > 0.0: p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"]) return loss ================================================ FILE: code/bert-base-count3/pretrain/transformers1/optimization_tf.py ================================================ # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Functions and classes related to optimization (weight updates).""" import re import tensorflow as tf class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): """Applies a warmup schedule on a given learning rate decay schedule.""" def __init__( self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None, ): super().__init__() self.initial_learning_rate = initial_learning_rate self.warmup_steps = warmup_steps self.power = power self.decay_schedule_fn = decay_schedule_fn self.name = name def __call__(self, step): with tf.name_scope(self.name or "WarmUp") as name: # Implements polynomial warmup. i.e., if global_step < warmup_steps, the # learning rate will be `global_step/num_warmup_steps * init_lr`. global_step_float = tf.cast(step, tf.float32) warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) warmup_percent_done = global_step_float / warmup_steps_float warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power) return tf.cond( global_step_float < warmup_steps_float, lambda: warmup_learning_rate, lambda: self.decay_schedule_fn(step), name=name, ) def get_config(self): return { "initial_learning_rate": self.initial_learning_rate, "decay_schedule_fn": self.decay_schedule_fn, "warmup_steps": self.warmup_steps, "power": self.power, "name": self.name, } def create_optimizer(init_lr, num_train_steps, num_warmup_steps, end_lr=0.0, optimizer_type="adamw"): """Creates an optimizer with learning rate schedule.""" # Implements linear decay of the learning rate. lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr, ) if num_warmup_steps: lr_schedule = WarmUp( initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps, ) optimizer = AdamWeightDecay( learning_rate=lr_schedule, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], ) return optimizer class AdamWeightDecay(tf.keras.optimizers.Adam): """Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact with the m and v parameters in strange ways. Instead we want ot decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent to adding the square of the weights to the loss with plain (non-momentum) SGD. """ def __init__( self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False, weight_decay_rate=0.0, include_in_weight_decay=None, exclude_from_weight_decay=None, name="AdamWeightDecay", **kwargs ): super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) self.weight_decay_rate = weight_decay_rate self._include_in_weight_decay = include_in_weight_decay self._exclude_from_weight_decay = exclude_from_weight_decay @classmethod def from_config(cls, config): """Creates an optimizer from its config with WarmUp custom object.""" custom_objects = {"WarmUp": WarmUp} return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects) def _prepare_local(self, var_device, var_dtype, apply_state): super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state) apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant( self.weight_decay_rate, name="adam_weight_decay_rate" ) def _decay_weights_op(self, var, learning_rate, apply_state): do_decay = self._do_use_weight_decay(var.name) if do_decay: return var.assign_sub( learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"], use_locking=self._use_locking, ) return tf.no_op() def apply_gradients(self, grads_and_vars, name=None): grads, tvars = list(zip(*grads_and_vars)) return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name,) def _get_lr(self, var_device, var_dtype, apply_state): """Retrieves the learning rate with the given state.""" if apply_state is None: return self._decayed_lr_t[var_dtype], {} apply_state = apply_state or {} coefficients = apply_state.get((var_device, var_dtype)) if coefficients is None: coefficients = self._fallback_apply_state(var_device, var_dtype) apply_state[(var_device, var_dtype)] = coefficients return coefficients["lr_t"], dict(apply_state=apply_state) def _resource_apply_dense(self, grad, var, apply_state=None): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs) def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs) def get_config(self): config = super().get_config() config.update({"weight_decay_rate": self.weight_decay_rate}) return config def _do_use_weight_decay(self, param_name): """Whether to use L2 weight decay for `param_name`.""" if self.weight_decay_rate == 0: return False if self._include_in_weight_decay: for r in self._include_in_weight_decay: if re.search(r, param_name) is not None: return True if self._exclude_from_weight_decay: for r in self._exclude_from_weight_decay: if re.search(r, param_name) is not None: return False return True # Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py class GradientAccumulator(object): """Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``. """ # We use the ON_READ synchronization policy so that no synchronization is # performed on assignment. To get the value, we call .value() which returns the # value on the current replica without synchronization. def __init__(self): """Initializes the accumulator.""" self._gradients = [] self._accum_steps = None @property def step(self): """Number of accumulated steps.""" if self._accum_steps is None: self._accum_steps = tf.Variable( tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) return self._accum_steps.value() @property def gradients(self): """The accumulated gradients on the current replica.""" if not self._gradients: raise ValueError("The accumulator should be called first to initialize the gradients") return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients) def __call__(self, gradients): """Accumulates :obj:`gradients` on the current replica.""" if not self._gradients: _ = self.step # Create the step variable. self._gradients.extend( [ tf.Variable( tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) if gradient is not None else gradient for gradient in gradients ] ) if len(gradients) != len(self._gradients): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) for accum_gradient, gradient in zip(self._gradients, gradients): if accum_gradient is not None and gradient is not None: accum_gradient.assign_add(gradient) self._accum_steps.assign_add(1) def reset(self): """Resets the accumulated gradients on the current replica.""" if not self._gradients: return self._accum_steps.assign(0) for gradient in self._gradients: if gradient is not None: gradient.assign(tf.zeros_like(gradient)) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/pipelines.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import csv import json import logging import os import pickle import sys from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import chain from os.path import abspath, exists from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union import numpy as np from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig from .configuration_utils import PretrainedConfig from .data import SquadExample, squad_convert_examples_to_features from .file_utils import is_tf_available, is_torch_available from .modelcard import ModelCard from .tokenization_auto import AutoTokenizer from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer if is_tf_available(): import tensorflow as tf from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, ) if is_torch_available(): import torch from .modeling_auto import ( AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelWithLMHead, ) if TYPE_CHECKING: from .modeling_utils import PreTrainedModel from .modeling_tf_utils import TFPreTrainedModel logger = logging.getLogger(__name__) def get_framework(model=None): """ Select framework (TensorFlow/PyTorch) to use. If both frameworks are installed and no specific model is provided, defaults to using PyTorch. """ if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the user supplied a model class instance. # Try to guess which framework to use from the model classname framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" elif not is_tf_available() and not is_torch_available(): raise RuntimeError( "At least one of TensorFlow 2.0 or PyTorch should be installed. " "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " "To install PyTorch, read the instructions at https://pytorch.org/." ) else: # framework = 'tf' if is_tf_available() else 'pt' framework = "pt" if is_torch_available() else "tf" return framework class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline """ @abstractmethod def __call__(self, *args, **kwargs): raise NotImplementedError() class DefaultArgumentHandler(ArgumentHandler): """ Default varargs argument parser handling parameters for each Pipeline """ @staticmethod def handle_kwargs(kwargs: Dict) -> List: if len(kwargs) == 1: output = list(kwargs.values()) else: output = list(chain(kwargs.values())) return DefaultArgumentHandler.handle_args(output) @staticmethod def handle_args(args: Sequence[Any]) -> List[str]: # Only one argument, let's do case by case if len(args) == 1: if isinstance(args[0], str): return [args[0]] elif not isinstance(args[0], list): return list(args) else: return args[0] # Multiple arguments (x1, x2, ...) elif len(args) > 1: if all([isinstance(arg, str) for arg in args]): return list(args) # If not instance of list, then it should instance of iterable elif isinstance(args, Iterable): return list(chain.from_iterable(chain(args))) else: raise ValueError( "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args)) ) else: return [] def __call__(self, *args, **kwargs): if len(kwargs) > 0 and len(args) > 0: raise ValueError("Pipeline cannot handle mixed args and kwargs") if len(kwargs) > 0: return DefaultArgumentHandler.handle_kwargs(kwargs) else: return DefaultArgumentHandler.handle_args(args) class PipelineDataFormat: """ Base class for all the pipeline supported data format both for reading and writing. Supported data formats currently includes: - JSON - CSV - stdin/stdout (pipe) PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. """ SUPPORTED_FORMATS = ["json", "csv", "pipe"] def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): self.output_path = output_path self.input_path = input_path self.column = column.split(",") if column is not None else [""] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column] if output_path is not None and not overwrite: if exists(abspath(self.output_path)): raise OSError("{} already exists on disk".format(self.output_path)) if input_path is not None: if not exists(abspath(self.input_path)): raise OSError("{} doesnt exist on disk".format(self.input_path)) @abstractmethod def __iter__(self): raise NotImplementedError() @abstractmethod def save(self, data: dict): """ Save the provided data object with the representation for the current `DataFormat`. :param data: data to store :return: """ raise NotImplementedError() def save_binary(self, data: Union[dict, List[dict]]) -> str: """ Save the provided data object as a pickle-formatted binary data on the disk. :param data: data to store :return: (str) Path where the data has been saved """ path, _ = os.path.splitext(self.output_path) binary_path = os.path.extsep.join((path, "pickle")) with open(binary_path, "wb+") as f_output: pickle.dump(data, f_output) return binary_path @staticmethod def from_str( format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): if format == "json": return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == "csv": return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == "pipe": return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) else: raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format)) class CsvPipelineDataFormat(PipelineDataFormat): def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): super().__init__(output_path, input_path, column, overwrite=overwrite) def __iter__(self): with open(self.input_path, "r") as f: reader = csv.DictReader(f) for row in reader: if self.is_multi_columns: yield {k: row[c] for k, c in self.column} else: yield row[self.column[0]] def save(self, data: List[dict]): with open(self.output_path, "w") as f: if len(data) > 0: writer = csv.DictWriter(f, list(data[0].keys())) writer.writeheader() writer.writerows(data) class JsonPipelineDataFormat(PipelineDataFormat): def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): super().__init__(output_path, input_path, column, overwrite=overwrite) with open(input_path, "r") as f: self._entries = json.load(f) def __iter__(self): for entry in self._entries: if self.is_multi_columns: yield {k: entry[c] for k, c in self.column} else: yield entry[self.column[0]] def save(self, data: dict): with open(self.output_path, "w") as f: json.dump(data, f) class PipedPipelineDataFormat(PipelineDataFormat): """ Read data from piped input to the python process. For multi columns data, columns should separated by \t If columns are provided, then the output will be a dictionary with {column_x: value_x} """ def __iter__(self): for line in sys.stdin: # Split for multi-columns if "\t" in line: line = line.split("\t") if self.column: # Dictionary to map arguments yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} else: yield tuple(line) # No dictionary to map arguments else: yield line def save(self, data: dict): print(data) def save_binary(self, data: Union[dict, List[dict]]) -> str: if self.output_path is None: raise KeyError( "When using piped input on pipeline outputting large object requires an output file path. " "Please provide such output path through --output argument." ) return super().save_binary(data) class _ScikitCompat(ABC): """ Interface layer for the Scikit and Keras compatibility. """ @abstractmethod def transform(self, X): raise NotImplementedError() @abstractmethod def predict(self, X): raise NotImplementedError() class Pipeline(_ScikitCompat): """ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across different pipelines. Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following operations: Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output Pipeline supports running on CPU or GPU through the device argument. Users can specify device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal. Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large tensor object as nested-lists. In order to avoid dumping such large structure as textual data we provide the binary_output constructor argument. If set to True, the output will be stored in the pickle format. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`): Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text. Return: :obj:`List` or :obj:`Dict`: Pipeline returns list or dictionary depending on: - Whether the user supplied multiple samples - Whether the pipeline exposes multiple fields in the output object """ default_input_names = None def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, task: str = "", args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False, ): if framework is None: framework = get_framework() self.model = model self.tokenizer = tokenizer self.modelcard = modelcard self.framework = framework self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device)) self.binary_output = binary_output self._args_parser = args_parser or DefaultArgumentHandler() # Special handling if self.framework == "pt" and self.device.type == "cuda": self.model = self.model.to(self.device) # Update config with task specific parameters task_specific_params = self.model.config.task_specific_params if task_specific_params is not None and task in task_specific_params: self.model.config.update(task_specific_params.get(task)) def save_pretrained(self, save_directory): """ Save the pipeline's model and tokenizer to the specified save_directory """ if not os.path.isdir(save_directory): logger.error("Provided path ({}) should be a directory".format(save_directory)) return self.model.save_pretrained(save_directory) self.tokenizer.save_pretrained(save_directory) if self.modelcard is not None: self.modelcard.save_pretrained(save_directory) def transform(self, X): """ Scikit / Keras interface to transformers1' pipelines. This method will forward to __call__(). """ return self(X=X) def predict(self, X): """ Scikit / Keras interface to transformers1' pipelines. This method will forward to __call__(). """ return self(X=X) @contextmanager def device_placement(self): """ Context Manager allowing tensor allocation on the user-specified device in framework agnostic way. example: # Explicitly ask for tensor allocation on CUDA device :0 nlp = pipeline(..., device=0) with nlp.device_placement(): # Every framework specific tensor allocation will be done on the request device output = nlp(...) Returns: Context manager """ if self.framework == "tf": with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)): yield else: if self.device.type == "cuda": torch.cuda.set_device(self.device) yield def ensure_tensor_on_device(self, **inputs): """ Ensure PyTorch tensors are on the specified device. :param inputs: :return: """ return {name: tensor.to(self.device) for name, tensor in inputs.items()} def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs): """ Parse arguments and tokenize """ # Parse arguments inputs = self._args_parser(*args, **kwargs) inputs = self.tokenizer.batch_encode_plus( inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, pad_to_max_length=pad_to_max_length, ) return inputs def __call__(self, *args, **kwargs): inputs = self._parse_and_tokenize(*args, **kwargs) return self._forward(inputs) def _forward(self, inputs, return_tensors=False): """ Internal framework specific forward dispatching. Args: inputs: dict holding all the keyworded arguments for required by the model forward method. return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array. Returns: Numpy array """ # Encode for forward with self.device_placement(): if self.framework == "tf": # TODO trace model predictions = self.model(inputs.data, training=False)[0] else: with torch.no_grad(): inputs = self.ensure_tensor_on_device(**inputs) predictions = self.model(**inputs)[0].cpu() if return_tensors: return predictions else: return predictions.numpy() class FeatureExtractionPipeline(Pipeline): """ Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer, which can be used as features in downstream tasks. This feature extraction pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "feature-extraction", for extracting features of a sequence. All models may be used for this pipeline. See a list of all models, including community-contributed models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, task: str = "", ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=True, task=task, ) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() class TextGenerationPipeline(Pipeline): """ Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt. This language generation pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "text-generation", for generating text from a specified prompt. The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available community models on `huggingface.co/models `__. """ # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia # in https://github.com/rusiaaman/XLNet-gen#methodology # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision and denounces one of the men as a horse thief. Although his father initially slaps him for making such an accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. """ ALLOWED_MODELS = [ "XLNetLMHeadModel", "TransfoXLLMHeadModel", "ReformerModelWithLMHead", "GPT2LMHeadModel", "OpenAIGPTLMHeadModel", "CTRLLMHeadModel", "TFXLNetLMHeadModel", "TFTransfoXLLMHeadModel", "TFGPT2LMHeadModel", "TFOpenAIGPTLMHeadModel", "TFCTRLLMHeadModel", ] def __call__( self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): if self.model.__class__.__name__ not in self.ALLOWED_MODELS: raise NotImplementedError( "Generation is currently not supported for {}. Please select a model from {} for generation.".format( self.model.__class__.__name__, self.ALLOWED_MODELS ) ) text_inputs = self._args_parser(*args) results = [] for prompt_text in text_inputs: # Manage correct placement of the tensors with self.device_placement(): if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]: inputs = self._parse_and_tokenize( self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False ) else: inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False) # set input_ids to None to allow empty prompt if inputs["input_ids"].shape[-1] == 0: inputs["input_ids"] = None inputs["attention_mask"] = None if self.framework == "pt" and inputs["input_ids"] is not None: inputs = self.ensure_tensor_on_device(**inputs) input_ids = inputs["input_ids"] # Ensure that batch size = 1 (batch generation not allowed for now) assert ( input_ids is None or input_ids.shape[0] == 1 ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information." output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs) # BS x SL result = [] for generated_sequence in output_sequences: generated_sequence = generated_sequence.numpy().tolist() record = {} if return_tensors: record["generated_token_ids"] = generated_sequence if return_text: # Decode text text = self.tokenizer.decode( generated_sequence, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used if input_ids is None: prompt_length = 0 else: prompt_length = len( self.tokenizer.decode( input_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) ) record["generated_text"] = prompt_text + text[prompt_length:] result.append(record) results += [result] if len(results) == 1: return results[0] return results class TextClassificationPipeline(Pipeline): """ Text classification pipeline using ModelForSequenceClassification head. See the `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information. This text classification pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "sentiment-analysis", for classifying sequences according to positive or negative sentiments. The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) return [{"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores] class FillMaskPipeline(Pipeline): """ Masked language modeling prediction pipeline using ModelWithLMHead head. See the `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information. This mask filling pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "fill-mask", for predicting masked tokens in a sequence. The models that this pipeline can use are models that have been trained with a masked language modeling objective, which includes the bi-directional models in the library. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, topk=5, task: str = "", ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=True, task=task, ) self.topk = topk def __call__(self, *args, **kwargs): inputs = self._parse_and_tokenize(*args, **kwargs) outputs = self._forward(inputs, return_tensors=True) results = [] batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0) for i in range(batch_size): input_ids = inputs["input_ids"][i] result = [] if self.framework == "tf": masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item() logits = outputs[i, masked_index, :] probs = tf.nn.softmax(logits) topk = tf.math.top_k(probs, k=self.topk) values, predictions = topk.values.numpy(), topk.indices.numpy() else: masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item() logits = outputs[i, masked_index, :] probs = logits.softmax(dim=0) values, predictions = probs.topk(self.topk) for v, p in zip(values.tolist(), predictions.tolist()): tokens = input_ids.numpy() tokens[masked_index] = p # Filter padding out: tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)] result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p}) # Append results += [result] if len(results) == 1: return results[0] return results class NerPipeline(Pipeline): """ Named Entity Recognition pipeline using ModelForTokenClassification head. See the `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information. This token recognition pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous. The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ default_input_names = "sequences" def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False, ignore_labels=["O"], task: str = "", grouped_entities: bool = False, ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=binary_output, task=task, ) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) self.ignore_labels = ignore_labels self.grouped_entities = grouped_entities def __call__(self, *args, **kwargs): inputs = self._args_parser(*args, **kwargs) answers = [] for sentence in inputs: # Manage correct placement of the tensors with self.device_placement(): tokens = self.tokenizer.encode_plus( sentence, return_attention_mask=False, return_tensors=self.framework, max_length=self.tokenizer.max_len, ) # Forward if self.framework == "tf": entities = self.model(tokens.data)[0][0].numpy() input_ids = tokens["input_ids"].numpy()[0] else: with torch.no_grad(): tokens = self.ensure_tensor_on_device(**tokens) entities = self.model(**tokens)[0][0].cpu().numpy() input_ids = tokens["input_ids"].cpu().numpy()[0] score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) entities = [] entity_groups = [] entity_group_disagg = [] # Filter to labels not in `self.ignore_labels` filtered_labels_idx = [ (idx, label_idx) for idx, label_idx in enumerate(labels_idx) if self.model.config.id2label[label_idx] not in self.ignore_labels ] for idx, label_idx in filtered_labels_idx: entity = { "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])), "score": score[idx][label_idx].item(), "entity": self.model.config.id2label[label_idx], "index": idx, } last_idx, _ = filtered_labels_idx[-1] if self.grouped_entities: if not entity_group_disagg: entity_group_disagg += [entity] if idx == last_idx: entity_groups += [self.group_entities(entity_group_disagg)] continue # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group if ( entity["entity"] == entity_group_disagg[-1]["entity"] and entity["index"] == entity_group_disagg[-1]["index"] + 1 ): entity_group_disagg += [entity] # Group the entities at the last entity if idx == last_idx: entity_groups += [self.group_entities(entity_group_disagg)] # If the current entity is different from the previous entity, aggregate the disaggregated entity group else: entity_groups += [self.group_entities(entity_group_disagg)] entity_group_disagg = [entity] entities += [entity] # Append if self.grouped_entities: answers += [entity_groups] else: answers += [entities] if len(answers) == 1: return answers[0] return answers def group_entities(self, entities): """ Returns grouped entities """ # Get the last entity in the entity group entity = entities[-1]["entity"] scores = np.mean([entity["score"] for entity in entities]) tokens = [entity["word"] for entity in entities] entity_group = { "entity_group": entity, "score": np.mean(scores), "word": self.tokenizer.convert_tokens_to_string(tokens), } return entity_group TokenClassificationPipeline = NerPipeline class QuestionAnsweringArgumentHandler(ArgumentHandler): """ QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to internal SquadExample / SquadFeature structures. QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied arguments. """ def __call__(self, *args, **kwargs): # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating if args is not None and len(args) > 0: if len(args) == 1: kwargs["X"] = args[0] else: kwargs["X"] = list(args) # Generic compatibility with sklearn and Keras # Batched data if "X" in kwargs or "data" in kwargs: inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] if isinstance(inputs, dict): inputs = [inputs] else: # Copy to avoid overriding arguments inputs = [i for i in inputs] for i, item in enumerate(inputs): if isinstance(item, dict): if any(k not in item for k in ["question", "context"]): raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") inputs[i] = QuestionAnsweringPipeline.create_sample(**item) elif not isinstance(item, SquadExample): raise ValueError( "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format( "X" if "X" in kwargs else "data" ) ) # Tabular input elif "question" in kwargs and "context" in kwargs: if isinstance(kwargs["question"], str): kwargs["question"] = [kwargs["question"]] if isinstance(kwargs["context"], str): kwargs["context"] = [kwargs["context"]] inputs = [ QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"]) ] else: raise ValueError("Unknown arguments {}".format(kwargs)) if not isinstance(inputs, list): inputs = [inputs] return inputs class QuestionAnsweringPipeline(Pipeline): """ Question Answering pipeline using ModelForQuestionAnswering head. See the `question answering usage <../usage.html#question-answering>`__ examples for more information. This question answering can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "question-answering", for answering questions given a context. The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ default_input_names = "question,context" def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, device: int = -1, task: str = "", **kwargs ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=QuestionAnsweringArgumentHandler(), device=device, task=task, **kwargs, ) @staticmethod def create_sample( question: Union[str, List[str]], context: Union[str, List[str]] ) -> Union[SquadExample, List[SquadExample]]: """ QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). We currently support extractive question answering. Arguments: question: (str, List[str]) The question to be ask for the associated context context: (str, List[str]) The context in which we will look for the answer. Returns: SquadExample initialized with the corresponding question and context. """ if isinstance(question, list): return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] else: return SquadExample(None, question, context, None, None, None) def __call__(self, *args, **kwargs): """ Args: We support multiple use-cases, the following are exclusive: X: sequence of SquadExample data: sequence of SquadExample question: (str, List[str]), batch of question(s) to map along with context context: (str, List[str]), batch of context(s) associated with the provided question keyword argument Returns: dict: {'answer': str, 'score": float, 'start": int, "end": int} answer: the textual answer in the intial context score: the score the current answer scored for the model start: the character index in the original string corresponding to the beginning of the answer' span end: the character index in the original string corresponding to the ending of the answer' span """ # Set defaults values kwargs.setdefault("topk", 1) kwargs.setdefault("doc_stride", 128) kwargs.setdefault("max_answer_len", 15) kwargs.setdefault("max_seq_len", 384) kwargs.setdefault("max_question_len", 64) kwargs.setdefault("handle_impossible_answer", False) if kwargs["topk"] < 1: raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"])) if kwargs["max_answer_len"] < 1: raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"])) # Convert inputs to features examples = self._args_parser(*args, **kwargs) features_list = [ squad_convert_examples_to_features( [example], self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False, tqdm_enabled=False, ) for example in examples ] all_answers = [] for features, example in zip(features_list, examples): model_input_names = self.tokenizer.model_input_names + ["input_ids"] fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names} # Manage tensor allocation on correct device with self.device_placement(): if self.framework == "tf": fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() else: with torch.no_grad(): # Retrieve the score for the context tokens only (removing question tokens) fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()} start, end = self.model(**fw_args) start, end = start.cpu().numpy(), end.cpu().numpy() min_null_score = 1000000 # large and positive answers = [] for (feature, start_, end_) in zip(features, start, end): # Normalize logits and spans to retrieve the answer start_ = np.exp(start_) / np.sum(np.exp(start_)) end_ = np.exp(end_) / np.sum(np.exp(end_)) # Mask padding and question start_, end_ = ( start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1), ) if kwargs["handle_impossible_answer"]: min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) start_[0] = end_[0] = 0 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text answers += [ { "score": score.item(), "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), "answer": " ".join( example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] ), } for s, e, score in zip(starts, ends, scores) ] if kwargs["handle_impossible_answer"]: answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]] all_answers += answers if len(all_answers) == 1: return all_answers[0] return all_answers def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: """ Take the output of any QuestionAnswering head and will generate probalities for each span to be the actual answer. In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or answer end position being before the starting position. The method supports output the k-best answer through the topk argument. Args: start: numpy array, holding individual start probabilities for each token end: numpy array, holding individual end probabilities for each token topk: int, indicates how many possible answer span(s) to extract from the model's output max_answer_len: int, maximum size of the answer to extract from the model's output """ # Ensure we have batch axis if start.ndim == 1: start = start[None] if end.ndim == 1: end = end[None] # Compute the score of each tuple(start, end) to be the real answer outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) # Remove candidate with end < start and end - start > max_answer_len candidates = np.tril(np.triu(outer), max_answer_len - 1) # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) scores_flat = candidates.flatten() if topk == 1: idx_sort = [np.argmax(scores_flat)] elif len(scores_flat) < topk: idx_sort = np.argsort(-scores_flat) else: idx = np.argpartition(-scores_flat, topk)[0:topk] idx_sort = idx[np.argsort(-scores_flat[idx])] start, end = np.unravel_index(idx_sort, candidates.shape)[1:] return start, end, candidates[0, start, end] def span_to_answer(self, text: str, start: int, end: int): """ When decoding from token probalities, this method maps token indexes to actual word in the initial context. Args: text: str, the actual context to extract the answer from start: int, starting answer token index end: int, ending answer token index Returns: dict: {'answer': str, 'start': int, 'end': int} """ words = [] token_idx = char_start_idx = char_end_idx = chars_idx = 0 for i, word in enumerate(text.split(" ")): token = self.tokenizer.tokenize(word) # Append words if they are in the span if start <= token_idx <= end: if token_idx == start: char_start_idx = chars_idx if token_idx == end: char_end_idx = chars_idx + len(word) words += [word] # Stop if we went over the end of the answer if token_idx > end: break # Append the subtokenization length to the running index token_idx += len(token) chars_idx += len(word) + 1 # Join text with spaces return { "answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx), } class SummarizationPipeline(Pipeline): """ Summarize news articles and other documents Usage:: # use bart in pytorch summarizer = pipeline("summarization") summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) # use t5 in tf summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf") summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default of the pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default of the pipeline will be loaded. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__( self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): r""" Args: *documents: (list of strings) articles to be summarized return_text: (bool, default=True) whether to add a decoded "summary_text" to each result return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result clean_up_tokenization_spaces: (`optional`) bool whether to include extra spaces in the output **generate_kwargs: extra kwargs passed to `self.model.generate`_ Returns: list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize .. _`self.model.generate`: https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate """ assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" assert len(documents) > 0, "Please provide a document to summarize" if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__: raise NotImplementedError( "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`" ) prefix = self.model.config.prefix if self.model.config.prefix is not None else "" if isinstance(documents[0], list): assert ( self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" documents = ([prefix + document for document in documents[0]],) pad_to_max_length = True elif isinstance(documents[0], str): documents = (prefix + documents[0],) pad_to_max_length = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( documents[0] ) ) with self.device_placement(): inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) input_length = inputs["input_ids"].shape[-1] elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() min_length = generate_kwargs.get("min_length", self.model.config.min_length) if input_length < min_length // 2: logger.warning( "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format( min_length, input_length ) ) max_length = generate_kwargs.get("max_length", self.model.config.max_length) if input_length < max_length: logger.warning( "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format( max_length, input_length ) ) summaries = self.model.generate( inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, ) results = [] for summary in summaries: record = {} if return_tensors: record["summary_token_ids"] = summary if return_text: record["summary_text"] = self.tokenizer.decode( summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) results.append(record) return results class TranslationPipeline(Pipeline): """ Translates from one language to another. Usage:: en_fr_translator = pipeline("translation_en_to_fr") en_fr_translator("How old are you?") The models that this pipeline can use are models that have been fine-tuned on a translation task, currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b" See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default of the pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default of the pipeline will be loaded. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__( self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): r""" Args: *args: (list of strings) texts to be translated return_text: (bool, default=True) whether to add a decoded "translation_text" to each result return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result **generate_kwargs: extra kwargs passed to `self.model.generate`_ Returns: list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate .. _`self.model.generate`: https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate """ assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" prefix = self.model.config.prefix if self.model.config.prefix is not None else "" if isinstance(args[0], list): assert ( self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" args = ([prefix + text for text in args[0]],) pad_to_max_length = True elif isinstance(args[0], str): args = (prefix + args[0],) pad_to_max_length = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( args[0] ) ) with self.device_placement(): inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) input_length = inputs["input_ids"].shape[-1] elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() max_length = generate_kwargs.get("max_length", self.model.config.max_length) if input_length > 0.9 * max_length: logger.warning( "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format( input_length, max_length ) ) translations = self.model.generate( inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, ) results = [] for translation in translations: record = {} if return_tensors: record["translation_token_ids"] = translation if return_text: record["translation_text"] = self.tokenizer.decode( translation, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) results.append(record) return results # Register all the supported tasks here SUPPORTED_TASKS = { "feature-extraction": { "impl": FeatureExtractionPipeline, "tf": TFAutoModel if is_tf_available() else None, "pt": AutoModel if is_torch_available() else None, "default": { "model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}, "config": None, "tokenizer": "distilbert-base-cased", }, }, "sentiment-analysis": { "impl": TextClassificationPipeline, "tf": TFAutoModelForSequenceClassification if is_tf_available() else None, "pt": AutoModelForSequenceClassification if is_torch_available() else None, "default": { "model": { "pt": "distilbert-base-uncased-finetuned-sst-2-english", "tf": "distilbert-base-uncased-finetuned-sst-2-english", }, "config": "distilbert-base-uncased-finetuned-sst-2-english", "tokenizer": "distilbert-base-uncased", }, }, "ner": { "impl": NerPipeline, "tf": TFAutoModelForTokenClassification if is_tf_available() else None, "pt": AutoModelForTokenClassification if is_torch_available() else None, "default": { "model": { "pt": "dbmdz/bert-large-cased-finetuned-conll03-english", "tf": "dbmdz/bert-large-cased-finetuned-conll03-english", }, "config": "dbmdz/bert-large-cased-finetuned-conll03-english", "tokenizer": "bert-large-cased", }, }, "question-answering": { "impl": QuestionAnsweringPipeline, "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None, "pt": AutoModelForQuestionAnswering if is_torch_available() else None, "default": { "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"}, "config": None, "tokenizer": ("distilbert-base-cased", {"use_fast": False}), }, }, "fill-mask": { "impl": FillMaskPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}, "config": None, "tokenizer": ("distilroberta-base", {"use_fast": False}), }, }, "summarization": { "impl": SummarizationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": {"model": {"pt": "facebook/bart-large-cnn", "tf": "t5-small"}, "config": None, "tokenizer": None}, }, "translation_en_to_fr": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "translation_en_to_de": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "translation_en_to_ro": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "text-generation": { "impl": TextGenerationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": {"model": {"pt": "gpt2", "tf": "gpt2"}, "config": None, "tokenizer": "gpt2"}, }, } def pipeline( task: str, model: Optional = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, framework: Optional[str] = None, **kwargs ) -> Pipeline: """ Utility factory method to build a pipeline. Pipeline are made of: - A Tokenizer instance in charge of mapping raw textual input to token - A Model instance - Some (optional) post processing for enhancing model's output Args: task (:obj:`str`): The task defining which pipeline will be returned. Currently accepted tasks are: - "feature-extraction": will return a :class:`~transformers1.FeatureExtractionPipeline` - "sentiment-analysis": will return a :class:`~transformers1.TextClassificationPipeline` - "ner": will return a :class:`~transformers1.NerPipeline` - "question-answering": will return a :class:`~transformers1.QuestionAnsweringPipeline` - "fill-mask": will return a :class:`~transformers1.FillMaskPipeline` - "summarization": will return a :class:`~transformers1.SummarizationPipeline` - "translation_xx_to_yy": will return a :class:`~transformers1.TranslationPipeline` model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a model identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default for this pipeline will be loaded. config (:obj:`str` or :obj:`~transformers1.PretrainedConfig`, `optional`, defaults to :obj:`None`): The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`, a model identifier or an actual pre-trained model configuration inheriting from :class:`~transformers1.PretrainedConfig`. If :obj:`None`, the default for this pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a model identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default for this pipeline will be loaded. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. Returns: :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers1.Pipeline`, according to the task. Examples:: from transformers1 import pipeline, AutoModelForTokenClassification, AutoTokenizer # Sentiment analysis pipeline pipeline('sentiment-analysis') # Question answering pipeline, specifying the checkpoint identifier pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased') # Named entity recognition pipeline, passing in a specific model and tokenizer model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") pipeline('ner', model=model, tokenizer=tokenizer) """ # Retrieve the task if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) framework = framework or get_framework(model) targeted_task = SUPPORTED_TASKS[task] task_class, model_class = targeted_task["impl"], targeted_task[framework] # Use default model/config/tokenizer for the task if no model is provided if model is None: models, config, tokenizer = [targeted_task["default"][k] for k in ["model", "config", "tokenizer"]] model = models[framework] # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = model elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = config else: # Impossible to guest what is the right tokenizer here raise Exception( "Impossible to guess which tokenizer to use. " "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer." ) modelcard = None # Try to infer modelcard from model or config name (if provided as str) if isinstance(model, str): modelcard = model elif isinstance(config, str): modelcard = config # Instantiate tokenizer if needed if isinstance(tokenizer, (str, tuple)): if isinstance(tokenizer, tuple): # For tuple we have (tokenizer name, {kwargs}) tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer) # Instantiate config if needed if isinstance(config, str): config = AutoConfig.from_pretrained(config) # Instantiate modelcard if needed if isinstance(modelcard, str): modelcard = ModelCard.from_pretrained(modelcard) # Instantiate model if needed if isinstance(model, str): # Handle transparent TF/PT model conversion model_kwargs = {} if framework == "pt" and model.endswith(".h5"): model_kwargs["from_tf"] = True logger.warning( "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " "Trying to load the model with PyTorch." ) elif framework == "tf" and model.endswith(".bin"): model_kwargs["from_pt"] = True logger.warning( "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " "Trying to load the model with Tensorflow." ) model = model_class.from_pretrained(model, config=config, **model_kwargs) return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_albert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for ALBERT model.""" import logging import os import unicodedata from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-spiece.model", "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-spiece.model", "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-spiece.model", "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-spiece.model", "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "albert-base-v1": 512, "albert-large-v1": 512, "albert-xlarge-v1": 512, "albert-xxlarge-v1": 512, "albert-base-v2": 512, "albert-large-v2": 512, "albert-xlarge-v2": 512, "albert-xxlarge-v2": 512, } SPIECE_UNDERLINE = "▁" class AlbertTokenizer(PreTrainedTokenizer): """ Constructs an ALBERT tokenizer. Based on `SentencePiece `__ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a .spm extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. bos_token (:obj:`string`, `optional`, defaults to "[CLS]"): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to "[SEP]"): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False, bos_token="[CLS]", eos_token="[SEP]", unk_token="", sep_token="[SEP]", pad_token="", cls_token="[CLS]", mask_token="[MASK]", **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return len(self.sp_model) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() return outputs def _tokenize(self, text, sample=False): """ Tokenize a string. """ text = self.preprocess_text(text) if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] else: cur_pieces[0] = cur_pieces[0][1:] cur_pieces.append(piece[-1]) new_pieces.extend(cur_pieces) else: new_pieces.append(piece) return new_pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An ALBERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return cls + token_ids_0 + sep return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Tokenizer class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BartConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, FlaubertConfig, GPT2Config, LongformerConfig, OpenAIGPTConfig, ReformerConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, ) from .configuration_marian import MarianConfig from .configuration_utils import PretrainedConfig from .tokenization_albert import AlbertTokenizer from .tokenization_bart import BartTokenizer from .tokenization_bert import BertTokenizer, BertTokenizerFast from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_longformer import LongformerTokenizer from .tokenization_marian import MarianTokenizer from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_reformer import ReformerTokenizer from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_t5 import T5Tokenizer from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlnet import XLNetTokenizer logger = logging.getLogger(__name__) TOKENIZER_MAPPING = OrderedDict( [ (T5Config, (T5Tokenizer, None)), (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), (AlbertConfig, (AlbertTokenizer, None)), (CamembertConfig, (CamembertTokenizer, None)), (XLMRobertaConfig, (XLMRobertaTokenizer, None)), (MarianConfig, (MarianTokenizer, None)), (BartConfig, (BartTokenizer, None)), (LongformerConfig, (LongformerTokenizer, None)), (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), (ReformerConfig, (ReformerTokenizer, None)), (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), (BertConfig, (BertTokenizer, BertTokenizerFast)), (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), (TransfoXLConfig, (TransfoXLTokenizer, TransfoXLTokenizerFast)), (XLNetConfig, (XLNetTokenizer, None)), (FlaubertConfig, (FlaubertTokenizer, None)), (XLMConfig, (XLMTokenizer, None)), (CTRLConfig, (CTRLTokenizer, None)), ] ) class AutoTokenizer: r""":class:`~transformers1.AutoTokenizer` is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct tokenizer class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: T5Tokenizer (T5 model) - `distilbert`: DistilBertTokenizer (DistilBert model) - `albert`: AlbertTokenizer (ALBERT model) - `camembert`: CamembertTokenizer (CamemBERT model) - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - `longformer`: LongformerTokenizer (AllenAI Longformer model) - `roberta`: RobertaTokenizer (RoBERTa model) - `bert`: BertTokenizer (Bert model) - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - `xlnet`: XLNetTokenizer (XLNet model) - `xlm`: XLMTokenizer (XLM model) - `ctrl`: CTRLTokenizer (Salesforce CTRL model) - `electra`: ElectraTokenizer (Google ELECTRA model) This class cannot be instantiated using `__init__()` (throw an error). """ def __init__(self): raise EnvironmentError( "AutoTokenizer is designed to be instantiated " "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): r""" Instantiate one of the tokenizer classes of the library from a pre-trained model vocabulary. The tokenizer class to instantiate is selected based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: T5Tokenizer (T5 model) - `distilbert`: DistilBertTokenizer (DistilBert model) - `albert`: AlbertTokenizer (ALBERT model) - `camembert`: CamembertTokenizer (CamemBERT model) - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - `longformer`: LongformerTokenizer (AllenAI Longformer model) - `roberta`: RobertaTokenizer (RoBERTa model) - `bert-base-japanese`: BertJapaneseTokenizer (Bert model) - `bert`: BertTokenizer (Bert model) - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - `xlnet`: XLNetTokenizer (XLNet model) - `xlm`: XLMTokenizer (XLM model) - `ctrl`: CTRLTokenizer (Salesforce CTRL model) - `electra`: ElectraTokenizer (Google ELECTRA model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers1.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the vocabulary files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. use_fast: (`optional`) boolean, default False: Indicate if transformers1 should try to load the fast version of the tokenizer (True) or use the Python one (False). inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers1.PreTrainedTokenizer` for details. Examples:: # Download vocabulary from S3 and cache. tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 (user-uploaded) and cache. tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) if "bert-base-japanese" in pretrained_model_name_or_path: return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) use_fast = kwargs.pop("use_fast", False) for config_class, (tokenizer_class_py, tokenizer_class_fast) in TOKENIZER_MAPPING.items(): if isinstance(config, config_class): if tokenizer_class_fast and use_fast: return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) else: return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) raise ValueError( "Unrecognized configuration class {} to build an AutoTokenizer.\n" "Model type should be one of {}.".format( config.__class__, ", ".join(c.__name__ for c in TOKENIZER_MAPPING.keys()) ) ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from .tokenization_roberta import RobertaTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer logger = logging.getLogger(__name__) # vocab and merges same as roberta vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" _all_bart_models = [ "facebook/bart-large", "facebook/bart-large-mnli", "facebook/bart-large-cnn", "facebook/bart-large-xsum", ] class BartTokenizer(RobertaTokenizer): # merges and vocab same as Roberta max_model_input_sizes = {m: 1024 for m in _all_bart_models} pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_bart_models}, "merges_file": {m: merges_url for m in _all_bart_models}, } _all_mbart_models = ["facebook/mbart-large-en-ro"] SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" class MBartTokenizer(XLMRobertaTokenizer): vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} max_model_input_sizes = {m: 1024 for m in _all_mbart_models} pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" import collections import logging import os import unicodedata from typing import List, Optional from tokenizers import BertWordPieceTokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "bert-base-uncased": 512, "bert-large-uncased": 512, "bert-base-cased": 512, "bert-large-cased": 512, "bert-base-multilingual-uncased": 512, "bert-base-multilingual-cased": 512, "bert-base-chinese": 512, "bert-base-german-cased": 512, "bert-large-uncased-whole-word-masking": 512, "bert-large-cased-whole-word-masking": 512, "bert-large-uncased-whole-word-masking-finetuned-squad": 512, "bert-large-cased-whole-word-masking-finetuned-squad": 512, "bert-base-cased-finetuned-mrpc": 512, "bert-base-german-dbmdz-cased": 512, "bert-base-german-dbmdz-uncased": 512, "TurkuNLP/bert-base-finnish-cased-v1": 512, "TurkuNLP/bert-base-finnish-uncased-v1": 512, "wietsedv/bert-base-dutch-cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "bert-base-uncased": {"do_lower_case": True}, "bert-large-uncased": {"do_lower_case": True}, "bert-base-cased": {"do_lower_case": False}, "bert-large-cased": {"do_lower_case": False}, "bert-base-multilingual-uncased": {"do_lower_case": True}, "bert-base-multilingual-cased": {"do_lower_case": False}, "bert-base-chinese": {"do_lower_case": False}, "bert-base-german-cased": {"do_lower_case": False}, "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, "bert-large-cased-whole-word-masking": {"do_lower_case": False}, "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, "bert-base-german-dbmdz-cased": {"do_lower_case": False}, "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, } def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): token = token.rstrip("\n") vocab[token] = index return vocab def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() if not text: return [] tokens = text.split() return tokens class BertTokenizer(PreTrainedTokenizer): r""" Constructs a BERT tokenizer. Based on WordPiece. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): File containing the vocabulary. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to do basic tokenization before WordPiece. never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): List of tokens which will never be split during tokenization. Only has an effect when :obj:`do_basic_tokenize=True` unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs ): super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: self.basic_tokenizer = BasicTokenizer( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars ) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) @property def vocab_size(self): return len(self.vocab) def get_vocab(self): return dict(self.vocab, **self.added_tokens_encoder) def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): for sub_token in self.wordpiece_tokenizer.tokenize(token): split_tokens.append(sub_token) else: split_tokens = self.wordpiece_tokenizer.tokenize(text) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, vocab_path): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: vocab_path (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: vocabulary indices are not consecutive." " Please check that the vocabulary is not corrupted!".format(vocab_file) ) index = token_index writer.write(token + "\n") index += 1 return (vocab_file,) class BasicTokenizer(object): """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): """ Constructs a BasicTokenizer. Args: **do_lower_case**: Whether to lower case the input. **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. **tokenize_chinese_chars**: (`optional`) boolean (default True) Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 """ if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = never_split self.tokenize_chinese_chars = tokenize_chinese_chars def tokenize(self, text, never_split=None): """ Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. Args: **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. """ never_split = self.never_split + (never_split if never_split is not None else []) text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: if self.do_lower_case and token not in never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token, never_split)) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens def _run_strip_accents(self, text): """Strips accents from a piece of text.""" text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output) def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" if never_split is not None and text in never_split: return [text] chars = list(text) i = 0 start_new_word = True output = [] while i < len(chars): char = chars[i] if _is_punctuation(char): output.append([char]) start_new_word = True else: if start_new_word: output.append([]) start_new_word = False output[-1].append(char) i += 1 return ["".join(x) for x in output] def _tokenize_chinese_chars(self, text): """Adds whitespace around any CJK character.""" output = [] for char in text: cp = ord(char) if self._is_chinese_char(cp): output.append(" ") output.append(char) output.append(" ") else: output.append(char) return "".join(output) def _is_chinese_char(self, cp): """Checks whether CP is the codepoint of a CJK character.""" # This defines a "chinese character" as anything in the CJK Unicode block: # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) # # Note that the CJK Unicode block is NOT all Japanese and Korean characters, # despite its name. The modern Korean Hangul alphabet is a different block, # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. if ( (cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) # or (cp >= 0x20000 and cp <= 0x2A6DF) # or (cp >= 0x2A700 and cp <= 0x2B73F) # or (cp >= 0x2B740 and cp <= 0x2B81F) # or (cp >= 0x2B820 and cp <= 0x2CEAF) # or (cp >= 0xF900 and cp <= 0xFAFF) or (cp >= 0x2F800 and cp <= 0x2FA1F) # ): # return True return False def _clean_text(self, text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] for char in text: cp = ord(char) if cp == 0 or cp == 0xFFFD or _is_control(char): continue if _is_whitespace(char): output.append(" ") else: output.append(char) return "".join(output) class WordpieceTokenizer(object): """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): self.vocab = vocab self.unk_token = unk_token self.max_input_chars_per_word = max_input_chars_per_word def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. """ output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start > 0: substr = "##" + substr if substr in self.vocab: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically contorl characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `chars` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False class BertTokenizerFast(PreTrainedTokenizerFast): r""" Constructs a "Fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Bert tokenization is Based on WordPiece. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): File containing the vocabulary. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to clean the text before tokenization by removing any control characters and replacing all whitespaces by the classic one. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", clean_text=True, tokenize_chinese_chars=True, strip_accents=True, wordpieces_prefix="##", **kwargs ): super().__init__( BertWordPieceTokenizer( vocab_file=vocab_file, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, clean_text=clean_text, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, wordpieces_prefix=wordpieces_prefix, ), unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) self.do_lower_case = do_lower_case def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] if token_ids_1: output += token_ids_1 + [self.sep_token_id] return output def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_bert_japanese.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" import collections import logging import os import unicodedata from typing import Optional from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/vocab.txt", "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt", "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/vocab.txt", "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "cl-tohoku/bert-base-japanese": 512, "cl-tohoku/bert-base-japanese-whole-word-masking": 512, "cl-tohoku/bert-base-japanese-char": 512, "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512, } PRETRAINED_INIT_CONFIGURATION = { "cl-tohoku/bert-base-japanese": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", }, "cl-tohoku/bert-base-japanese-whole-word-masking": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", }, "cl-tohoku/bert-base-japanese-char": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", }, "cl-tohoku/bert-base-japanese-char-whole-word-masking": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", }, } class BertJapaneseTokenizer(BertTokenizer): """BERT tokenizer for Japanese text""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=False, do_word_tokenize=True, do_subword_tokenize=True, word_tokenizer_type="basic", subword_tokenizer_type="wordpiece", never_split=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", mecab_kwargs=None, **kwargs ): """Constructs a MecabBertTokenizer. Args: **vocab_file**: Path to a one-wordpiece-per-line vocabulary file. **do_lower_case**: (`optional`) boolean (default True) Whether to lower case the input. Only has an effect when do_basic_tokenize=True. **do_word_tokenize**: (`optional`) boolean (default True) Whether to do word tokenization. **do_subword_tokenize**: (`optional`) boolean (default True) Whether to do subword tokenization. **word_tokenizer_type**: (`optional`) string (default "basic") Type of word tokenizer. **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None) """ super(BertTokenizer, self).__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) # ^^ We call the grandparent's init, not the parent's. if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_word_tokenize = do_word_tokenize if do_word_tokenize: if word_tokenizer_type == "basic": self.word_tokenizer = BasicTokenizer( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False ) elif word_tokenizer_type == "mecab": self.word_tokenizer = MecabTokenizer( do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) ) else: raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) self.do_subword_tokenize = do_subword_tokenize if do_subword_tokenize: if subword_tokenizer_type == "wordpiece": self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) elif subword_tokenizer_type == "character": self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) else: raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) def _tokenize(self, text): if self.do_word_tokenize: tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) else: tokens = [text] if self.do_subword_tokenize: split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] else: split_tokens = tokens return split_tokens class MecabTokenizer: """Runs basic tokenization with MeCab morphological parser.""" def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None): """Constructs a MecabTokenizer. Args: **do_lower_case**: (`optional`) boolean (default True) Whether to lower case the input. **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. **normalize_text**: (`optional`) boolean (default True) Whether to apply unicode normalization to text before tokenization. **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "") """ self.do_lower_case = do_lower_case self.never_split = never_split if never_split is not None else [] self.normalize_text = normalize_text import MeCab self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" if self.normalize_text: text = unicodedata.normalize("NFKC", text) never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] mecab_output = self.mecab.parse(text) cursor = 0 for line in mecab_output.split("\n"): if line == "EOS": break token, _ = line.split("\t") token_start = text.index(token, cursor) token_end = token_start + len(token) if self.do_lower_case and token not in never_split: token = token.lower() tokens.append(token) cursor = token_end return tokens class CharacterTokenizer(object): """Runs Character tokenziation.""" def __init__(self, vocab, unk_token, normalize_text=True): """Constructs a CharacterTokenizer. Args: **vocab**: Vocabulary object. **unk_token**: str A special symbol for out-of-vocabulary token. **normalize_text**: (`optional`) boolean (default True) Whether to apply unicode normalization to text before tokenization. """ self.vocab = vocab self.unk_token = unk_token self.normalize_text = normalize_text def tokenize(self, text): """Tokenizes a piece of text into characters. For example: input = "apple" output = ["a", "p", "p", "l", "e"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of characters. """ if self.normalize_text: text = unicodedata.normalize("NFKC", text) output_tokens = [] for i, char in enumerate(text): if char not in self.vocab: output_tokens.append(self.unk_token) continue output_tokens.append(char) return output_tokens ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_camembert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for Camembert model.""" import logging import os from shutil import copyfile from typing import List, Optional import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "camembert-base": None, } SHARED_MODEL_IDENTIFIERS = [ # Load with # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")` "Musixmatch/umberto-commoncrawl-cased-v1", "Musixmatch/umberto-wikipedia-uncased-v1", ] class CamembertTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", additional_special_tokens=["NOTUSED", "NOTUSED"], **kwargs ): super().__init__( max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # sentencepiece vocabulary (this is the case for and self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3} self.fairseq_offset = len(self.fairseq_tokens_to_ids) self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A CamemBERT sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): return len(self.fairseq_tokens_to_ids) + len(self.sp_model) def _tokenize(self, text): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] elif self.sp_model.PieceToId(token) == 0: # Convert sentence piece unk token to fairseq unk token index return self.unk_token_id return self.fairseq_offset + self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Salesforce CTRL.""" import json import logging import os import regex as re from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"}, "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "ctrl": 256, } CONTROL_CODES = { "Pregnancy": 168629, "Christianity": 7675, "Explain": 106423, "Fitness": 63440, "Saving": 63163, "Ask": 27171, "Ass": 95985, "Joke": 163509, "Questions": 45622, "Thoughts": 49605, "Retail": 52342, "Feminism": 164338, "Writing": 11992, "Atheism": 192263, "Netflix": 48616, "Computing": 39639, "Opinion": 43213, "Alone": 44967, "Funny": 58917, "Gaming": 40358, "Human": 4088, "India": 1331, "Joker": 77138, "Diet": 36206, "Legal": 11859, "Norman": 4939, "Tip": 72689, "Weight": 52343, "Movies": 46273, "Running": 23425, "Science": 2090, "Horror": 37793, "Confession": 60572, "Finance": 12250, "Politics": 16360, "Scary": 191985, "Support": 12654, "Technologies": 32516, "Teenage": 66160, "Event": 32769, "Learned": 67460, "Notion": 182770, "Wikipedia": 37583, "Books": 6665, "Extract": 76050, "Confessions": 102701, "Conspiracy": 75932, "Links": 63674, "Narcissus": 150425, "Relationship": 54766, "Relationships": 134796, "Reviews": 41671, "News": 4256, "Translation": 26820, "multilingual": 128406, } def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char pairs = set(pairs) return pairs class CTRLTokenizer(PreTrainedTokenizer): """ Constructs a CTRL tokenizer. Peculiarities: - Byte-Pair-Encoding This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES control_codes = CONTROL_CODES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super().__init__(unk_token=unk_token, **kwargs) with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) word = tuple(list(word[:-1]) + [word[-1] + ""]) pairs = get_pairs(word) if not pairs: return token while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = "@@ ".join(word) word = word[:-4] self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ split_tokens = [] words = re.findall(r"\S+\n?", text) for token in words: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) # return ''.join(tokens_generated_so_far) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_distilbert.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for DistilBERT.""" import logging from .tokenization_bert import BertTokenizer, BertTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "distilbert-base-uncased": 512, "distilbert-base-uncased-distilled-squad": 512, "distilbert-base-cased": 512, "distilbert-base-cased-distilled-squad": 512, "distilbert-base-german-cased": 512, "distilbert-base-multilingual-cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "distilbert-base-uncased": {"do_lower_case": True}, "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, "distilbert-base-cased": {"do_lower_case": False}, "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, "distilbert-base-german-cased": {"do_lower_case": False}, "distilbert-base-multilingual-cased": {"do_lower_case": False}, } class DistilBertTokenizer(BertTokenizer): r""" Constructs a DistilBertTokenizer. :class:`~transformers1.DistilBertTokenizer is identical to :class:`~transformers1.BertTokenizer` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizer` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] class DistilBertTokenizerFast(BertTokenizerFast): r""" Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library). :class:`~transformers1.DistilBertTokenizerFast` is identical to :class:`~transformers1.BertTokenizerFast` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizerFast` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_electra.py ================================================ # coding=utf-8 # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .tokenization_bert import BertTokenizer, BertTokenizerFast VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt", "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt", "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt", "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt", "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt", "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/electra-small-generator": 512, "google/electra-base-generator": 512, "google/electra-large-generator": 512, "google/electra-small-discriminator": 512, "google/electra-base-discriminator": 512, "google/electra-large-discriminator": 512, } PRETRAINED_INIT_CONFIGURATION = { "google/electra-small-generator": {"do_lower_case": True}, "google/electra-base-generator": {"do_lower_case": True}, "google/electra-large-generator": {"do_lower_case": True}, "google/electra-small-discriminator": {"do_lower_case": True}, "google/electra-base-discriminator": {"do_lower_case": True}, "google/electra-large-discriminator": {"do_lower_case": True}, } class ElectraTokenizer(BertTokenizer): r""" Constructs an Electra tokenizer. :class:`~transformers1.ElectraTokenizer` is identical to :class:`~transformers1.BertTokenizer` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizer` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION class ElectraTokenizerFast(BertTokenizerFast): r""" Constructs a "Fast" Electra Fast tokenizer (backed by HuggingFace's `tokenizers` library). :class:`~transformers1.ElectraTokenizerFast` is identical to :class:`~transformers1.BertTokenizerFast` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizerFast` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Flaubert, based on XLM.""" import logging import unicodedata import six from .tokenization_xlm import XLMTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", }, "merges_file": { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "flaubert/flaubert_small_cased": 512, "flaubert/flaubert_base_uncased": 512, "flaubert/flaubert_base_cased": 512, "flaubert/flaubert_large_cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "flaubert/flaubert_small_cased": {"do_lowercase": False}, "flaubert/flaubert_base_uncased": {"do_lowercase": True}, "flaubert/flaubert_base_cased": {"do_lowercase": False}, "flaubert/flaubert_large_cased": {"do_lowercase": False}, } def convert_to_unicode(text): """ Converts `text` to Unicode (if it's not already), assuming UTF-8 input. """ # six_ensure_text is copied from https://github.com/benjaminp/six def six_ensure_text(s, encoding="utf-8", errors="strict"): if isinstance(s, six.binary_type): return s.decode(encoding, errors) elif isinstance(s, six.text_type): return s else: raise TypeError("not expecting type '%s'" % type(s)) return six_ensure_text(text, encoding="utf-8", errors="ignore") class FlaubertTokenizer(XLMTokenizer): """ BPE tokenizer for Flaubert - Moses preprocessing & tokenization - Normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ (ex: "__classify__") to a vocabulary - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) This tokenizer inherits from :class:`~transformers1.XLMTokenizer`. Please check the superclass for usage examples and documentation regarding arguments. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, do_lowercase=False, **kwargs): super().__init__(**kwargs) self.do_lowercase = do_lowercase self.do_lowercase_and_remove_accent = False def preprocess_text(self, text): text = text.replace("``", '"').replace("''", '"') text = convert_to_unicode(text) text = unicodedata.normalize("NFC", text) if self.do_lowercase: text = text.lower() return text def _tokenize(self, text, bypass_tokenizer=False): """ Tokenize a string given language code using Moses. Details of tokenization: - [sacremoses](https://github.com/alvations/sacremoses): port of Moses - Install with `pip install sacremoses` Args: - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. Returns: List of tokens. """ lang = "fr" if lang and self.lang2id and lang not in self.lang2id: logger.error( "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." ) if bypass_tokenizer: text = text.split() else: text = self.preprocess_text(text) text = self.moses_pipeline(text, lang=lang) text = self.moses_tokenize(text, lang=lang) split_tokens = [] for token in text: if token: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" import json import logging import os from functools import lru_cache import regex as re from tokenizers import ByteLevelBPETokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", }, "merges_file": { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "gpt2": 1024, "gpt2-medium": 1024, "gpt2-large": 1024, "gpt2-xl": 1024, "distilgpt2": 1024, } @lru_cache() def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. """ bs = ( list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) ) cs = bs[:] n = 0 for b in range(2 ** 8): if b not in bs: bs.append(b) cs.append(2 ** 8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, errors="replace", unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs ): super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: bpe_merges = merges_handle.read().split("\n")[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) pairs = get_pairs(word) if not pairs: return token while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ bpe_tokens = [] for token in re.findall(self.pat, text): token = "".join( self.byte_encoder[b] for b in token.encode("utf-8") ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file def prepare_for_tokenization(self, text, **kwargs): if "add_prefix_space" in kwargs and kwargs["add_prefix_space"]: return " " + text return text class GPT2TokenizerFast(PreTrainedTokenizerFast): """ Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library). Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. add_prefix_space (:obj:`bool`, `optional`, defaults to `False`): Whether to add a leading space to the first word. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceeding space) trim_offsets (:obj:`bool`, `optional`, defaults to `True`): Whether the post processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", add_prefix_space=False, trim_offsets=True, **kwargs ): super().__init__( ByteLevelBPETokenizer( vocab_file=vocab_file, merges_file=merges_file, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, ), bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs, ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast logger = logging.getLogger(__name__) # vocab and merges same as roberta vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" _all_longformer_models = [ "allenai/longformer-base-4096", "allenai/longformer-large-4096", "allenai/longformer-large-4096-finetuned-triviaqa", "allenai/longformer-base-4096-extra.pos.embd.only", "allenai/longformer-large-4096-extra.pos.embd.only", ] PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "allenai/longformer-base-4096": 4096, "allenai/longformer-large-4096": 4096, "allenai/longformer-large-4096-finetuned-triviaqa": 4096, "allenai/longformer-base-4096-extra.pos.embd.only": 4096, "allenai/longformer-large-4096-extra.pos.embd.only": 4096, } class LongformerTokenizer(RobertaTokenizer): # merges and vocab same as Roberta max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models}, } class LongformerTokenizerFast(RobertaTokenizerFast): # merges and vocab same as Roberta max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models}, } ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_marian.py ================================================ import json import re import warnings from pathlib import Path from shutil import copyfile from typing import Dict, List, Optional, Tuple, Union import sentencepiece from .file_utils import S3_BUCKET_PREFIX from .tokenization_utils import BatchEncoding, PreTrainedTokenizer vocab_files_names = { "source_spm": "source.spm", "target_spm": "target.spm", "vocab": "vocab.json", "tokenizer_config_file": "tokenizer_config.json", } MODEL_NAMES = ("opus-mt-en-de",) # TODO(SS): delete this, the only required constant is vocab_files_names PRETRAINED_VOCAB_FILES_MAP = { k: {m: f"{S3_BUCKET_PREFIX}/Helsinki-NLP/{m}/{fname}" for m in MODEL_NAMES} for k, fname in vocab_files_names.items() } # Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json class MarianTokenizer(PreTrainedTokenizer): """Sentencepiece tokenizer for marian. Source and target languages have different SPM models. The logic is use the relevant source_spm or target_spm to encode txt as pieces, then look up each piece in a vocab dictionary. Examples:: from transformers1 import MarianTokenizer tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts) # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]. # model(**batch) should work """ vocab_files_names = vocab_files_names pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = {m: 512 for m in MODEL_NAMES} model_input_names = ["attention_mask"] # actually attention_mask, decoder_attention_mask language_code_re = re.compile(">>.+<<") # type: re.Pattern def __init__( self, vocab=None, source_spm=None, target_spm=None, source_lang=None, target_lang=None, unk_token="", eos_token="", pad_token="", max_len=512, **kwargs, ): super().__init__( # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id max_len=max_len, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, **kwargs, ) self.encoder = load_json(vocab) if self.unk_token not in self.encoder: raise KeyError(" token must be in vocab") assert self.pad_token in self.encoder self.decoder = {v: k for k, v in self.encoder.items()} self.source_lang = source_lang self.target_lang = target_lang self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")] self.spm_files = [source_spm, target_spm] # load SentencePiece model for pre-processing self.spm_source = load_spm(source_spm) self.spm_target = load_spm(target_spm) self.current_spm = self.spm_source # Multilingual target side: default to using first supported language code. self._setup_normalizer() def _setup_normalizer(self): try: from mosestokenizer import MosesPunctuationNormalizer self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang) except ImportError: warnings.warn("Recommended: pip install mosestokenizer") self.punc_normalizer = lambda x: x def normalize(self, x: str) -> str: """Cover moses empty string edge case. They return empty list for '' input!""" return self.punc_normalizer(x) if x else "" def _convert_token_to_id(self, token): return self.encoder.get(token, self.encoder[self.unk_token]) def remove_language_code(self, text: str): """Remove language codes like <> before sentencepiece""" match = self.language_code_re.match(text) code: list = [match.group(0)] if match else [] return code, self.language_code_re.sub("", text) def _tokenize(self, text: str) -> List[str]: code, text = self.remove_language_code(text) pieces = self.current_spm.EncodeAsPieces(text) return code + pieces def _convert_id_to_token(self, index: int) -> str: """Converts an index (integer) in a token (str) using the encoder.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens: List[str]) -> str: """Uses target language sentencepiece model""" return self.spm_target.DecodePieces(tokens) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: """Build model inputs from a sequence by appending eos_token_id.""" if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] # We don't expect to process pairs, but leave the pair logic for API consistency return token_ids_0 + token_ids_1 + [self.eos_token_id] def prepare_translation_batch( self, src_texts: List[str], tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, pad_to_max_length: bool = True, return_tensors: str = "pt", ) -> BatchEncoding: """Prepare model inputs for translation. For best performance, translate one sentence at a time. Arguments: src_texts: list of src language texts tgt_texts: list of tgt language texts max_length: (None) defer to config (1024 for mbart-large-en-ro) pad_to_max_length: (bool) return_tensors: (str) default "pt" returns pytorch tensors, pass None to return lists. Returns: BatchEncoding: with keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask] all shaped bs, seq_len. (BatchEncoding is a dict of string -> tensor or lists). If no tgt_text is specified, the only keys will be input_ids and attention_mask. """ if "" in src_texts: raise ValueError(f"found empty string in src_texts: {src_texts}") self.current_spm = self.spm_source src_texts = [self.normalize(t) for t in src_texts] # this does not appear to do much model_inputs: BatchEncoding = self.batch_encode_plus( src_texts, add_special_tokens=True, return_tensors=return_tensors, max_length=max_length, pad_to_max_length=pad_to_max_length, ) if tgt_texts is None: return model_inputs self.current_spm = self.spm_target decoder_inputs: BatchEncoding = self.batch_encode_plus( tgt_texts, add_special_tokens=True, return_tensors=return_tensors, max_length=max_length, pad_to_max_length=pad_to_max_length, ) for k, v in decoder_inputs.items(): model_inputs[f"decoder_{k}"] = v self.current_spm = self.spm_source return model_inputs @property def vocab_size(self) -> int: return len(self.encoder) def save_vocabulary(self, save_directory: str) -> Tuple[str]: """save vocab file to json and copy spm files from their original path.""" save_dir = Path(save_directory) assert save_dir.is_dir(), f"{save_directory} should be a directory" save_json(self.encoder, save_dir / self.vocab_files_names["vocab"]) for f in self.spm_files: dest_path = save_dir / Path(f).name if not dest_path.exists(): copyfile(f, save_dir / Path(f).name) return tuple(save_dir / f for f in self.vocab_files_names) def get_vocab(self) -> Dict: vocab = self.encoder.copy() vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self) -> Dict: state = self.__dict__.copy() state.update({k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer"]}) return state def __setstate__(self, d: Dict) -> None: self.__dict__ = d self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) self.current_spm = self.spm_source self._setup_normalizer() def num_special_tokens_to_add(self, **unused): """Just EOS""" return 1 def _special_token_mask(self, seq): all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp all_special_ids.remove(self.unk_token_id) # is only sometimes special return [1 if x in all_special_ids else 0 for x in seq] def get_special_tokens_mask( self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False ) -> List[int]: """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" if already_has_special_tokens: return self._special_token_mask(token_ids_0) elif token_ids_1 is None: return self._special_token_mask(token_ids_0) + [1] else: return self._special_token_mask(token_ids_0 + token_ids_1) + [1] def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: spm = sentencepiece.SentencePieceProcessor() spm.Load(path) return spm def save_json(data, path: str) -> None: with open(path, "w") as f: json.dump(data, f, indent=2) def load_json(path: str) -> Union[Dict, List]: with open(path, "r") as f: return json.load(f) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_openai.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" import json import logging import os import re from tokenizers import CharBPETokenizer from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"}, "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "openai-gpt": 512, } def get_pairs(word): """ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length strings) """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def text_standardize(text): """ fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization """ text = text.replace("—", "-") text = text.replace("–", "-") text = text.replace("―", "-") text = text.replace("…", "...") text = text.replace("´", "'") text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text) text = re.sub(r"\s*\n\s*", " \n ", text) text = re.sub(r"[^\S\n]+", " ", text) return text.strip() class OpenAIGPTTokenizer(PreTrainedTokenizer): """ BPE tokenizer. Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super().__init__(unk_token=unk_token, **kwargs) try: import ftfy from spacy.lang.en import English _nlp = English() self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) self.fix_text = None with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token + "" while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) if word == "\n ": word = "\n" self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ split_tokens = [] if self.fix_text is None: # Using BERT's BasicTokenizer text = self.nlp.tokenize(text) for token in text: split_tokens.extend([t for t in self.bpe(token).split(" ")]) else: # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) text = self.nlp(text_standardize(self.fix_text(text))) for token in text: split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an id in a token (BPE) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = "".join(tokens).replace("", " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): """ Construct a "Fast" BPE tokenizer for OpenAI GPT (backed by HuggingFace's `tokenizers` library). Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): kwargs.setdefault("unk_token", unk_token) super().__init__( CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True), **kwargs, ) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization class for model Reformer.""" import logging import os from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model" } } #################################################### # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, } class ReformerTokenizer(PreTrainedTokenizer): """ Constructs an Reformer tokenizer. Based on `SentencePiece `__ . This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a `.spm` extension) that contains the vocabulary necessary to instantiate a tokenizer. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`): Additional special tokens used by the tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, eos_token="", unk_token="", pad_token="", additional_special_tokens=[], **kwargs ): super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, additional_special_tokens=additional_special_tokens, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use ReformerTokenizer:" "https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return self.sp_model.get_piece_size() def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use ReformerTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def _tokenize(self, text, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) return pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = self.sp_model.decode_pieces(tokens) return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for RoBERTa.""" import logging from typing import List, Optional from tokenizers import AddedToken from tokenizers.processors import RobertaProcessing from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", }, "merges_file": { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "roberta-base": 512, "roberta-large": 512, "roberta-large-mnli": 512, "distilroberta-base": 512, "roberta-base-openai-detector": 512, "roberta-large-openai-detector": 512, } class RobertaTokenizer(GPT2Tokenizer): """ Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, merges_file, errors="replace", bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", **kwargs ): super().__init__( vocab_file=vocab_file, merges_file=merges_file, errors=errors, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs, ) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A RoBERTa sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] def prepare_for_tokenization(self, text, add_special_tokens=False, **kwargs): if "add_prefix_space" in kwargs: add_prefix_space = kwargs["add_prefix_space"] else: add_prefix_space = add_special_tokens if add_prefix_space and not text[0].isspace(): text = " " + text return text class RobertaTokenizerFast(GPT2TokenizerFast): """ Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library). Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. add_prefix_space (:obj:`bool`, `optional`, defaults to `False`): Whether to add a leading space to the first word. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceeding space) trim_offsets (:obj:`bool`, `optional`, defaults to `True`): Whether the post processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, merges_file, errors="replace", bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", add_prefix_space=True, trim_offsets=True, **kwargs ): kwargs.setdefault("pad_token", pad_token) kwargs.setdefault("sep_token", sep_token) kwargs.setdefault("cls_token", cls_token) kwargs.setdefault("mask_token", mask_token) super().__init__( vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, **kwargs, ) self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing( sep=(sep_token, self.sep_token_id), cls=(cls_token, self.cls_token_id), add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, ) self.backend_tokenizer.add_special_tokens([kwargs["mask_token"]]) @PreTrainedTokenizer.mask_token.setter def mask_token(self, value): if not isinstance(value, AddedToken): value = AddedToken(value, lstrip=True) self._mask_token = str(value) self._maybe_update_backend([value]) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] if token_ids_1 is None: return output return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_t5.py ================================================ # coding=utf-8 # Copyright 2018 T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization class for model T5.""" import logging import os import re from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } #################################################### # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, "t5-base": 512, "t5-large": 512, "t5-3b": 512, "t5-11b": 512, } class T5Tokenizer(PreTrainedTokenizer): """ Constructs an XLNet tokenizer. Based on `SentencePiece `__ . This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a `.spm` extension) that contains the vocabulary necessary to instantiate a tokenizer. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`): Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are accessible as "" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed from the end of the vocabulary up to beginnning ("" is the last token in the vocabulary like in T5 preprocessing see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`): Additional special tokens used by the tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, eos_token="", unk_token="", pad_token="", extra_ids=100, additional_special_tokens=None, **kwargs ): # Add extra_ids to the special token list if extra_ids > 0: if additional_special_tokens is None: additional_special_tokens = [] additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, additional_special_tokens=additional_special_tokens, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use T5Tokenizer:" "https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.vocab_file = vocab_file self._extra_ids = extra_ids self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return self.sp_model.get_piece_size() + self._extra_ids def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use T5Tokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def _tokenize(self, text, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) return pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: token = "".format(self.vocab_size - 1 - index) return token def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = self.sp_model.decode_pieces(tokens) return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. """ import glob import logging import os import pickle import re from collections import Counter, OrderedDict from typing import Optional import numpy as np from tokenizers import Tokenizer from tokenizers.implementations import BaseTokenizer from tokenizers.models import WordLevel from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit from tokenizers.processors import BertProcessing from .file_utils import cached_path, is_torch_available from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast if is_torch_available(): import torch logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"} VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"} PRETRAINED_VOCAB_FILES_MAP = { "pretrained_vocab_file": { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", } } PRETRAINED_VOCAB_FILES_MAP_FAST = { "pretrained_vocab_file": { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "transfo-xl-wt103": None, } PRETRAINED_CORPUS_ARCHIVE_MAP = { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", } CORPUS_NAME = "corpus.bin" class TransfoXLTokenizer(PreTrainedTokenizer): """ Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = [] def __init__( self, special=None, min_freq=0, max_size=None, lower_case=False, delimiter=None, vocab_file=None, pretrained_vocab_file=None, never_split=None, unk_token="", eos_token="", additional_special_tokens=[""], **kwargs ): super().__init__( unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs ) if never_split is None: never_split = self.all_special_tokens if special is None: special = [] self.counter = Counter() self.special = special self.min_freq = min_freq self.max_size = max_size self.lower_case = lower_case self.delimiter = delimiter self.vocab_file = vocab_file self.never_split = never_split self.punctuation_symbols = '!"#$%&()*+,-./\:;<=>?@[\\]^_`{|}~' # noqa: W605 self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols)) self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern() try: if pretrained_vocab_file is not None: # Hack because, honestly this tokenizer was not made to be used # in a library like ours, at all. vocab_dict = torch.load(pretrained_vocab_file) for key, value in vocab_dict.items(): if key not in self.__dict__: self.__dict__[key] = value if vocab_file is not None: self.build_vocab() except Exception: raise ValueError( "Unable to parse file {}. Unknown format. " "If you tried to load a model saved through TransfoXLTokenizerFast," "please note they are not compatible.".format(pretrained_vocab_file) ) if vocab_file is not None: self.build_vocab() def _compile_space_around_punctuation_pattern(self): look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols) look_ahead_to_match_all_except_space = "(?=[^\s])" # noqa: W605 return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space) def count_file(self, path, verbose=False, add_eos=False): if verbose: logger.info("counting file {} ...".format(path)) assert os.path.exists(path) sents = [] with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos) self.counter.update(symbols) sents.append(symbols) return sents def count_sents(self, sents, verbose=False): """ sents : a list of sentences, each a list of tokenized symbols """ if verbose: logger.info("counting {} sents ...".format(len(sents))) for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) self.counter.update(symbols) def _build_from_file(self, vocab_file): self.idx2sym = [] self.sym2idx = OrderedDict() with open(vocab_file, "r", encoding="utf-8") as f: for line in f: symb = line.strip().split()[0] self.add_symbol(symb) if "" in self.sym2idx: self.unk_idx = self.sym2idx[""] elif "" in self.sym2idx: self.unk_idx = self.sym2idx[""] else: raise ValueError("No token in vocabulary") def save_vocabulary(self, vocab_path): """ Save the vocabulary and special tokens file to a directory. Args: vocab_path (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ logger.warning( "Please note you will not be able to load the save vocabulary in" " Rust-based TransfoXLTokenizerFast as they don't share the same structure." ) if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) else: vocab_file = vocab_path torch.save(self.__dict__, vocab_file) return (vocab_file,) def build_vocab(self): if self.vocab_file: logger.info("building vocab from {}".format(self.vocab_file)) self._build_from_file(self.vocab_file) logger.info("final vocab size {}".format(len(self))) else: logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size)) self.idx2sym = [] self.sym2idx = OrderedDict() for sym in self.special: self.add_special(sym) for sym, cnt in self.counter.most_common(self.max_size): if cnt < self.min_freq: break self.add_symbol(sym) logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter))) def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): if verbose: logger.info("encoding file {} ...".format(path)) assert os.path.exists(path) encoded = [] with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos) encoded.append(self.convert_to_tensor(symbols)) if ordered: encoded = torch.cat(encoded) return encoded def encode_sents(self, sents, ordered=False, verbose=False): if verbose: logger.info("encoding {} sents ...".format(len(sents))) encoded = [] for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) encoded.append(self.convert_to_tensor(symbols)) if ordered: encoded = torch.cat(encoded) return encoded def add_special(self, sym): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym]) def add_symbol(self, sym): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 def _convert_id_to_token(self, idx): """Converts an id in a token (BPE) using the vocab.""" assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx) return self.idx2sym[idx] def _convert_token_to_id(self, sym): """ Converts a token (str) in an id using the vocab. """ if sym in self.sym2idx: return self.sym2idx[sym] else: # logger.info('encounter unk {}'.format(sym)) # assert '' not in sym if hasattr(self, "unk_idx"): return self.sym2idx.get(sym, self.unk_idx) # Backward compatibility with pre-trained models elif "" in self.sym2idx: return self.sym2idx[""] elif "" in self.sym2idx: return self.sym2idx[""] else: raise ValueError("Token not in vocabulary and no token in vocabulary for replacement") def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).strip() return out_string def convert_to_tensor(self, symbols): return torch.LongTensor(self.convert_tokens_to_ids(symbols)) @property def vocab_size(self): return len(self.idx2sym) def get_vocab(self): return dict(self.sym2idx, **self.added_tokens_encoder) def _tokenize(self, line, add_eos=False, add_double_eos=False): line = line.strip() # convert to lower case if self.lower_case: line = line.lower() # empty delimiter '' will evaluate False if self.delimiter == "": symbols = line else: symbols = line.split(self.delimiter) if add_double_eos: # lm1b return [""] + symbols + [""] elif add_eos: return symbols + [""] else: return symbols def prepare_for_tokenization(self, text, **kwargs): # add spaces before punctuation symbols as should be done in transfo-xl if "add_space_before_punct_symbol" in kwargs and kwargs["add_space_before_punct_symbol"]: text = self.punctuation_with_space_around_pattern.sub(r" ", text) elif self.punction_without_space_before_pattern.search(text): # searches until the first occurence of a punctuation symbol without surrounding spaces logger.warning( "You might want to consider setting `add_space_before_punct_symbol=True` as an argument to the `tokenizer.encode()` to avoid tokenizing words with punctuation symbols to the `` token" ) return text class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer): def __init__( self, vocab_file, delimiter, lowercase, unk_token, eos_token, add_eos=False, add_double_eos=False, normalization: Optional[str] = None, ): try: tokenizer = WordLevel(vocab_file, unk_token=unk_token) tokenizer = Tokenizer(tokenizer) except Exception: raise ValueError( "Unable to parse file {}. Unknown format. " "If you tried to load a model saved through TransfoXLTokenizer," "please note they are not compatible.".format(vocab_file) ) # Create the correct normalization path normalizer = [] # Include unicode normalization if normalization: normalizer += [unicode_normalizer_from_str(normalization)] # Include case normalization if lowercase: normalizer += [Lowercase()] # Strip normalizer at the end normalizer += [Strip(left=True, right=True)] if len(normalizer) > 0: tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0] # Setup the splitter tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit() if add_double_eos: tokenizer.post_processor = BertProcessing( (eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token)) ) parameters = { "model": "TransfoXLModel", "add_eos": add_eos, "add_double_eos": add_double_eos, "unk_token": unk_token, "eos_token": eos_token, "delimiter": delimiter, "lowercase": lowercase, } super().__init__(tokenizer, parameters) class TransfoXLTokenizerFast(PreTrainedTokenizerFast): """ Construct a "Fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library). The Transformer-XL tokenizer is a word-level tokenizer (no sub-word tokenization). Adapted from Vocab class in https://github.com/kimiyoung/transformer-xl This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. """ vocab_files_names = VOCAB_FILES_NAMES_FAST pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = [] def __init__( self, special=None, min_freq=0, max_size=None, lower_case=False, delimiter=None, vocab_file=None, pretrained_vocab_file=None, never_split=None, unk_token="", eos_token="", additional_special_tokens=[""], add_eos=False, add_double_eos=False, normalization=None, **kwargs ): super().__init__( _TransfoXLDelimiterLookupTokenizer( vocab_file=vocab_file or pretrained_vocab_file, delimiter=delimiter, lowercase=lower_case, unk_token=unk_token, eos_token=eos_token, add_eos=add_eos, add_double_eos=add_double_eos, normalization=normalization, ), unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs, ) def save_pretrained(self, save_directory): logger.warning( "Please note you will not be able to load the vocabulary in" " Python-based TransfoXLTokenizer as they don't share the same structure." ) return super().save_pretrained(save_directory) class LMOrderedIterator(object): def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): """ data -- LongTensor -- the LongTensor is strictly ordered """ self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device # Work out how cleanly we can divide the dataset into bsz parts. self.n_step = data.size(0) // bsz # Trim off any extra elements that wouldn't cleanly fit (remainders). data = data.narrow(0, 0, self.n_step * bsz) # Evenly divide the data across the bsz batches. self.data = data.view(bsz, -1).t().contiguous().to(device) # Number of mini-batches self.n_batch = (self.n_step + self.bptt - 1) // self.bptt def get_batch(self, i, bptt=None): if bptt is None: bptt = self.bptt seq_len = min(bptt, self.data.size(0) - 1 - i) end_idx = i + seq_len beg_idx = max(0, i - self.ext_len) data = self.data[beg_idx:end_idx] target = self.data[i + 1 : i + 1 + seq_len] data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) return data_out, target_out, seq_len def get_fixlen_iter(self, start=0): for i in range(start, self.data.size(0) - 1, self.bptt): yield self.get_batch(i) def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): max_len = self.bptt + max_deviation * std i = start while True: bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) data, target, seq_len = self.get_batch(i, bptt) i += seq_len yield data, target, seq_len if i >= self.data.size(0) - 2: break def __iter__(self): return self.get_fixlen_iter() class LMShuffledIterator(object): def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): """ data -- list[LongTensor] -- there is no order among the LongTensors """ self.data = data self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device self.shuffle = shuffle def get_sent_stream(self): # index iterator epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data))) # sentence iterator for idx in epoch_indices: yield self.data[idx] def stream_iterator(self, sent_stream): # streams for each data in the batch streams = [None] * self.bsz data = torch.LongTensor(self.bptt, self.bsz) target = torch.LongTensor(self.bptt, self.bsz) n_retain = 0 while True: # data : [n_retain+bptt x bsz] # target : [bptt x bsz] data[n_retain:].fill_(-1) target.fill_(-1) valid_batch = True for i in range(self.bsz): n_filled = 0 try: while n_filled < self.bptt: if streams[i] is None or len(streams[i]) <= 1: streams[i] = next(sent_stream) # number of new tokens to fill in n_new = min(len(streams[i]) - 1, self.bptt - n_filled) # first n_retain tokens are retained from last batch data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new] target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1] streams[i] = streams[i][n_new:] n_filled += n_new except StopIteration: valid_batch = False break if not valid_batch: return data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) yield data_out, target_out, self.bptt n_retain = min(data.size(0), self.ext_len) if n_retain > 0: data[:n_retain] = data[-n_retain:] data.resize_(n_retain + self.bptt, data.size(1)) def __iter__(self): # sent_stream is an iterator sent_stream = self.get_sent_stream() for batch in self.stream_iterator(sent_stream): yield batch class LMMultiFileIterator(LMShuffledIterator): def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): self.paths = paths self.vocab = vocab self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device self.shuffle = shuffle def get_sent_stream(self, path): sents = self.vocab.encode_file(path, add_double_eos=True) if self.shuffle: np.random.shuffle(sents) sent_stream = iter(sents) return sent_stream def __iter__(self): if self.shuffle: np.random.shuffle(self.paths) for path in self.paths: # sent_stream is an iterator sent_stream = self.get_sent_stream(path) for batch in self.stream_iterator(sent_stream): yield batch class TransfoXLCorpus(object): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ Instantiate a pre-processed corpus. """ vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] else: corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) # redirect to the cache, if necessary try: resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) except EnvironmentError: logger.error( "Corpus '{}' was not found in corpus list ({}). " "We assumed '{}' was a path or url but couldn't find files {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, corpus_file, ) ) return None if resolved_corpus_file == corpus_file: logger.info("loading corpus file {}".format(corpus_file)) else: logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file)) # Instantiate tokenizer. corpus = cls(*inputs, **kwargs) corpus_dict = torch.load(resolved_corpus_file) for key, value in corpus_dict.items(): corpus.__dict__[key] = value corpus.vocab = vocab if corpus.train is not None: corpus.train = torch.tensor(corpus.train, dtype=torch.long) if corpus.valid is not None: corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) if corpus.test is not None: corpus.test = torch.tensor(corpus.test, dtype=torch.long) return corpus def __init__(self, *args, **kwargs): self.vocab = TransfoXLTokenizer(*args, **kwargs) self.dataset = None self.train = None self.valid = None self.test = None def build_corpus(self, path, dataset): self.dataset = dataset if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: self.vocab.count_file(os.path.join(path, "train.txt")) self.vocab.count_file(os.path.join(path, "valid.txt")) self.vocab.count_file(os.path.join(path, "test.txt")) elif self.dataset == "wt103": self.vocab.count_file(os.path.join(path, "train.txt")) elif self.dataset == "lm1b": train_path_pattern = os.path.join( path, "1-billion-word-language-modeling-benchmark-r13output", "training-monolingual.tokenized.shuffled", "news.en-*", ) train_paths = glob.glob(train_path_pattern) # the vocab will load from file when build_vocab() is called self.vocab.build_vocab() if self.dataset in ["ptb", "wt2", "wt103"]: self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True) self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True) elif self.dataset in ["enwik8", "text8"]: self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False) self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False) elif self.dataset == "lm1b": self.train = train_paths self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True) def get_iterator(self, split, *args, **kwargs): if split == "train": if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(self.train, *args, **kwargs) elif self.dataset == "lm1b": kwargs["shuffle"] = True data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) elif split in ["valid", "test"]: data = self.valid if split == "valid" else self.test if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(data, *args, **kwargs) elif self.dataset == "lm1b": data_iter = LMShuffledIterator(data, *args, **kwargs) return data_iter def get_lm_corpus(datadir, dataset): fn = os.path.join(datadir, "cache.pt") fn_pickle = os.path.join(datadir, "cache.pkl") if os.path.exists(fn): logger.info("Loading cached dataset...") corpus = torch.load(fn_pickle) elif os.path.exists(fn): logger.info("Loading cached dataset from pickle...") with open(fn, "rb") as fp: corpus = pickle.load(fp) else: logger.info("Producing dataset {}...".format(dataset)) kwargs = {} if dataset in ["wt103", "wt2"]: kwargs["special"] = [""] kwargs["lower_case"] = False elif dataset == "ptb": kwargs["special"] = [""] kwargs["lower_case"] = True elif dataset == "lm1b": kwargs["special"] = [] kwargs["lower_case"] = False kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") elif dataset in ["enwik8", "text8"]: pass corpus = TransfoXLCorpus(datadir, dataset, **kwargs) torch.save(corpus, fn) return corpus ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_utils.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for python and fast tokenizers. Fast tokenizers are provided by HuggingFace's tokenizers library.""" import copy import functools import itertools import json import logging import operator import os import re import warnings from collections import UserDict, defaultdict from contextlib import contextmanager from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from tokenizers import AddedToken as AddedTokenFast from tokenizers import Encoding as EncodingFast from tokenizers.decoders import Decoder as DecoderFast from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available, torch_required if is_tf_available(): import tensorflow as tf if is_torch_available(): import torch logger = logging.getLogger(__name__) SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER # Define type aliases and NamedTuples TextInput = str PreTokenizedInput = List[str] EncodedInput = List[int] TextInputPair = Tuple[str, str] PreTokenizedInputPair = Tuple[List[str], List[str]] EncodedInputPair = Tuple[List[int], List[int]] class CharSpan(NamedTuple): """ Character span in the original string Args: start: index of the first character in the original string end: index of the character following the last character in the original string """ start: int end: int class TokenSpan(NamedTuple): """ Token span in an encoded string (list of tokens) Args: start: index of the first token in the span end: index of the token following the last token in the span """ start: int end: int def flatten(x: Sequence): """ Flatten the provided (potentially nested) sequence Args: x (Sequence): Potentially nested sequence to flatten Returns: list: Flattened sequence """ return functools.reduce(operator.iconcat, x, []) @contextmanager def truncate_and_pad( tokenizer: BaseTokenizerFast, max_length: int, stride: int, strategy: str, pad_to_max_length: bool, padding_side: str, pad_token_id: int, pad_token_type_id: int, pad_token: str, ): """ This contextmanager is in charge of defining the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers library) and restore the tokenizer settings afterwards. This contextmanager assumes the provider tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a padding / truncation strategy before, then it will be reset to no padding/truncation when exiting the managed section. Args: tokenizer (BaseTokenizerFast): The tokenizer which will be used max_length (int): The maximum size of the sequence stride (int): The stride to use when handling overflow strategy (str): Overflowing logic to use pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length padding_side (str): "left" or "right" indicating the direction the output sequence will be padded pad_token_id (int): The integer representation of the padding token to use pad_token_type_id (int): The integer representation of the padding token type to use pad_token (str): The string representation of the padding token to use """ # Handle all the truncation and padding stuff if max_length is not None: tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) if pad_to_max_length and (pad_token and pad_token_id >= 0): tokenizer.enable_padding( max_length=max_length, direction=padding_side, pad_id=pad_token_id, pad_type_id=pad_token_type_id, pad_token=pad_token, ) elif pad_to_max_length: logger.warning( "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n" "To remove this error, you can add a new pad token and then resize model embedding:\n" "\ttokenizer.pad_token = ''\n\tmodel.resize_token_embeddings(len(tokenizer))".format( pad_token, pad_token_id ) ) yield # TODO(morgan, anthony): once we have a simple way to serialize tokenizers maybe store and restore the state afterward # to avoid destructing the padding / truncation strategy as we do now. if max_length is not None: tokenizer.no_truncation() if pad_to_max_length and (pad_token and pad_token_id >= 0): tokenizer.no_padding() class BatchEncoding(UserDict): """ BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc). This class is derived from a python Dictionary and can be used as a dictionnary. In addition, this class expose utility methods to map from word/char space to token space. Args: data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask'...) encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`): If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/char space to token space the `EncodingFast` instance or list of instance (for batches) hold these informations. """ def __init__( self, data: Optional[Dict[str, Any]] = None, encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, ): super().__init__(data) if isinstance(encoding, EncodingFast): encoding = [encoding] self._encodings = encoding def __getitem__(self, item: Union[int, str]) -> EncodingFast: """ If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...) If the key is an integer, get the EncodingFast for batch item with index `key` """ if isinstance(item, str): return self.data[item] elif self._encodings is not None: return self._encodings[item] else: raise KeyError( "Indexing with integers (to access backend Encoding for a given batch index) " "is not available when using Python based tokenizers" ) def __getattr__(self, item: str): return self.data[item] def keys(self): return self.data.keys() def values(self): return self.data.values() def items(self): return self.data.items() # After this point: # Extended properties and methods only available for fast (Rust-based) tokenizers # provided by HuggingFace tokenizers library. @property def encodings(self) -> Optional[List[EncodingFast]]: """ Return the list all encoding from the tokenization process Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer """ return self._encodings def tokens(self, batch_index: int = 0) -> List[int]: if not self._encodings: raise ValueError("tokens() is not available when using Python based tokenizers") return self._encodings[batch_index].tokens def words(self, batch_index: int = 0) -> List[Optional[int]]: if not self._encodings: raise ValueError("words() is not available when using Python based tokenizers") return self._encodings[batch_index].words def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. Can be called as: - self.token_to_word(token_index) if batch size is 1 - self.token_to_word(batch_index, token_index) if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the sequence. Returns: word_index (:obj:`int`): index of the word in the input sequence. """ if not self._encodings: raise ValueError("token_to_word() is not available when using Python based tokenizers") if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan: """ Get the encoded token span corresponding to a word in the sequence of the batch. Token spans are returned as a TokenSpan NamedTuple with: start: index of the first token end: index of the token following the last token Can be called as: - self.word_to_tokens(word_index) if batch size is 1 - self.word_to_tokens(batch_index, word_index) if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_word_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the word in the sequence word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: token_span (:obj:`TokenSpan`): Span of tokens in the encoded sequence. TokenSpan are NamedTuple with: start: index of the first token end: index of the token following the last token """ if not self._encodings: raise ValueError("word_to_tokens() is not available when using Python based tokenizers") if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index if batch_index < 0: batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index))) def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: """ Get the character span corresponding to an encoded token in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: start: index of the first character in the original string associated to the token end: index of the character following the last character in the original string associated to the token Can be called as: - self.token_to_chars(token_index) if batch size is 1 - self.token_to_chars(batch_index, token_index) if batch size is greater or equal to 1 Args: batch_or_token_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in the sequence. Returns: char_span (:obj:`CharSpan`): Span of characters in the original string. CharSpan are NamedTuple with: start: index of the first character in the original string end: index of the character following the last character in the original string """ if not self._encodings: raise ValueError("token_to_chars() is not available when using Python based tokenizers") if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. Can be called as: - self.char_to_token(char_index) if batch size is 1 - self.char_to_token(batch_index, char_index) if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: token_index (:obj:`int`): Index of the token. """ if not self._encodings: raise ValueError("char_to_token() is not available when using Python based tokenizers") if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_token(char_index) def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: """ Get the character span in the original string corresponding to given word in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: start: index of the first character in the original string end: index of the character following the last character in the original string Can be called as: - self.word_to_chars(word_index) if batch size is 1 - self.word_to_chars(batch_index, word_index) if batch size is greater or equal to 1 Args: batch_or_word_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: char_span (:obj:`CharSpan` or :obj:`List[CharSpan]`): Span(s) of the associated character or characters in the string. CharSpan are NamedTuple with: start: index of the first character associated to the token in the original string end: index of the character following the last character associated to the token in the original string """ if not self._encodings: raise ValueError("word_to_chars() is not available when using Python based tokenizers") if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. Can be called as: - self.char_to_word(char_index) if batch size is 1 - self.char_to_word(batch_index, char_index) if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the character in the orginal string. char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the orginal string. Returns: token_index (:obj:`int` or :obj:`List[int]`): Index or indices of the associated encoded token(s). """ if not self._encodings: raise ValueError("char_to_word() is not available when using Python based tokenizers") if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_word(char_index) @torch_required def to(self, device: str): """Send all values to device by calling v.to(device)""" self.data = {k: v.to(device) for k, v in self.data.items()} return self class SpecialTokensMixin: """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and handles specific behaviors related to special tokens. In particular, this class hold the attributes which can be used to directly access to these special tokens in a model-independant manner and allow to set and update the special tokens. """ SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", "additional_special_tokens", ] def __init__(self, **kwargs): self._bos_token = None self._eos_token = None self._unk_token = None self._sep_token = None self._pad_token = None self._cls_token = None self._mask_token = None self._pad_token_type_id = 0 self._additional_special_tokens = [] for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) setattr(self, key, value) elif isinstance(value, AddedTokenFast): setattr(self, key, str(value)) elif isinstance(value, str): setattr(self, key, value) else: raise TypeError( "special token {} has to be either str or AddedTokenFast but got: {}".format(key, type(value)) ) @property def bos_token(self): """ Beginning of sentence token (string). Log an error if used while not having been set. """ if self._bos_token is None: logger.error("Using bos_token, but it is not set yet.") return self._bos_token @property def eos_token(self): """ End of sentence token (string). Log an error if used while not having been set. """ if self._eos_token is None: logger.error("Using eos_token, but it is not set yet.") return self._eos_token @property def unk_token(self): """ Unknown token (string). Log an error if used while not having been set. """ if self._unk_token is None: logger.error("Using unk_token, but it is not set yet.") return self._unk_token @property def sep_token(self): """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ if self._sep_token is None: logger.error("Using sep_token, but it is not set yet.") return self._sep_token @property def pad_token(self): """ Padding token (string). Log an error if used while not having been set. """ if self._pad_token is None: logger.error("Using pad_token, but it is not set yet.") return self._pad_token @property def cls_token(self): """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ if self._cls_token is None: logger.error("Using cls_token, but it is not set yet.") return self._cls_token @property def mask_token(self): """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ if self._mask_token is None: logger.error("Using mask_token, but it is not set yet.") return self._mask_token @property def additional_special_tokens(self): """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ if self._additional_special_tokens is None: logger.error("Using additional_special_tokens, but it is not set yet.") return self._additional_special_tokens def _maybe_update_backend(self, value): """ To be overriden by derived class if a backend tokenizer has to be updated. """ pass @bos_token.setter def bos_token(self, value): self._bos_token = value self._maybe_update_backend([value]) @eos_token.setter def eos_token(self, value): self._eos_token = value self._maybe_update_backend([value]) @unk_token.setter def unk_token(self, value): self._unk_token = value self._maybe_update_backend([value]) @sep_token.setter def sep_token(self, value): self._sep_token = value self._maybe_update_backend([value]) @pad_token.setter def pad_token(self, value): self._pad_token = value self._maybe_update_backend([value]) @cls_token.setter def cls_token(self, value): self._cls_token = value self._maybe_update_backend([value]) @mask_token.setter def mask_token(self, value): self._mask_token = value self._maybe_update_backend([value]) @additional_special_tokens.setter def additional_special_tokens(self, value): self._additional_special_tokens = value self._maybe_update_backend(value) @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self): """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.eos_token) @property def unk_token_id(self): """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self): """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.pad_token) @property def pad_token_type_id(self): """ Id of the padding token type in the vocabulary.""" return self._pad_token_type_id @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self): """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self): """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.additional_special_tokens) @property def special_tokens_map(self): """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their values ('', ''...) """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = attr_value return set_attr @property def all_special_tokens(self): """ List all the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = [] set_attr = self.special_tokens_map for attr_value in set_attr.values(): all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) all_toks = list(set(all_toks)) return all_toks @property def all_special_ids(self): """ List the vocabulary indices of the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = self.all_special_tokens all_ids = self.convert_tokens_to_ids(all_toks) return all_ids class PreTrainedTokenizer(SpecialTokensMixin): """ Base class for all tokenizers. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). Class attributes (overridden by derived classes): - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. Args: - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model. When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`). no associated max_length can be found in ``max_model_input_sizes``. - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. Should be selected between ['right', 'left'] - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the model ("token_type_ids", "attention_mask"...). - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ vocab_files_names: Dict[str, str] = {} pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} max_model_input_sizes: Dict[str, int] = {} model_input_names: List[str] = ["token_type_ids", "attention_mask"] padding_side: str = "right" NO_PAD_TOKEN_FOR_BATCH_MSG = ( "No padding token is set for this model, therefore no batch can be made with uneven " "sequences. Set a padding token or adjust the lengths of the sequences building the " "batch so that every sequence is of the same length." ) UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( "The sequences building the batch are not of the same size, no tensor " "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" "up to the larger sequence's length." ) @property def vocab_size(self) -> int: """ Size of the base vocabulary (without the added tokens) """ raise NotImplementedError @property def is_fast(self) -> bool: return False @property def max_len(self) -> int: """ Kept here for backward compatibility. Now renamed to `model_max_length` to avoid ambiguity. """ return self.model_max_length @property def max_len_single_sentence(self) -> int: return self.model_max_length - self.num_special_tokens_to_add(pair=False) @property def max_len_sentences_pair(self) -> int: return self.model_max_length - self.num_special_tokens_to_add(pair=True) @max_len_single_sentence.setter def max_len_single_sentence(self, value) -> int: """ For backward compatibility, allow to try to setup 'max_len_single_sentence' """ if value == self.model_max_length - self.num_special_tokens_to_add(pair=False): logger.warning( "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." ) else: raise ValueError( "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." ) @max_len_sentences_pair.setter def max_len_sentences_pair(self, value) -> int: """ For backward compatibility, allow to try to setup 'max_len_sentences_pair' """ if value == self.model_max_length - self.num_special_tokens_to_add(pair=True): logger.warning( "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." ) else: raise ValueError( "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." ) def get_vocab(self): """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """ raise NotImplementedError() def __init__(self, model_max_length=None, **kwargs): super().__init__(**kwargs) # For backward compatibility we fallback to set model_max_length from max_len if provided if "max_len" in kwargs: warnings.warn( "Parameter max_len is deprecated and will be removed in a future release. " "Use model_max_length instead.", category=FutureWarning, ) model_max_length = kwargs.pop("max_len") self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. self.padding_side = kwargs.pop("padding_side", self.padding_side) assert self.padding_side in [ "right", "left", ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) # Added tokens self.added_tokens_encoder = {} self.unique_added_tokens_encoder = set() self.added_tokens_decoder = {} # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) self.init_inputs = () self.init_kwargs = {} def __len__(self): """ Size of the full vocabulary with the added tokens """ return self.vocab_size + len(self.added_tokens_encoder) @classmethod def from_pretrained(cls, *inputs, **kwargs): r""" Instantiate a :class:`~transformers1.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers1.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the vocabulary files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers1.PreTrainedTokenizer` for details. Examples:: # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer # Download vocabulary from S3 and cache. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 (user-uploaded) and cache. tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') # If the tokenizer uses a single vocabulary file, you can point directly to this file tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') # You can link tokens to special vocabulary when instantiating tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') # You should be sure '' is in the vocabulary when doing that. # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) assert tokenizer.unk_token == '' """ return cls._from_pretrained(*inputs, **kwargs) @classmethod def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} init_configuration = {} if pretrained_model_name_or_path in s3_models: # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] if ( cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration ): init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() else: # Get the vocabulary from local files logger.info( "Model name '{}' not found in model shortcut name list ({}). " "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path ) ) if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not supported." "Use a model identifier or the path to a directory instead." ) logger.warning( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated" ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } # Look for the tokenizer main vocabulary files + the additional tokens files for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): if os.path.isdir(pretrained_model_name_or_path): full_file_name = os.path.join(pretrained_model_name_or_path, file_name) if not os.path.exists(full_file_name): logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) full_file_name = None else: full_file_name = hf_bucket_url( pretrained_model_name_or_path, filename=file_name, use_cdn=False ) vocab_files[file_id] = full_file_name # Get files from url, cache, or disk depending on the case try: resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None else: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) except EnvironmentError: if pretrained_model_name_or_path in s3_models: msg = "Couldn't reach server at '{}' to download vocabulary files." else: msg = ( "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {}, but couldn't find such vocabulary files at this path or url.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path, list(cls.vocab_files_names.values()), ) ) raise EnvironmentError(msg) if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " "named {} but couldn't find such vocabulary files at this path or url.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path, list(cls.vocab_files_names.values()), ) ) for file_id, file_path in vocab_files.items(): if file_path == resolved_vocab_files[file_id]: logger.info("loading file {}".format(file_path)) else: logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) if tokenizer_config_file is not None: with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: init_kwargs = init_configuration # Update with newly provided kwargs init_kwargs.update(kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if special_tokens_map_file is not None: with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: special_tokens_map = json.load(special_tokens_map_handle) for key, value in special_tokens_map.items(): if key not in init_kwargs: init_kwargs[key] = value # Instantiate tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` tokenizer.init_inputs = init_inputs tokenizer.init_kwargs = init_kwargs # update unique_added_tokens_encoder with special tokens for correct tokenization tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) # Add supplementary tokens. if added_tokens_file is not None: with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} tokenizer.added_tokens_encoder.update(added_tok_encoder) tokenizer.added_tokens_decoder.update(added_tok_decoder) tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys())) return tokenizer def save_pretrained(self, save_directory): """ Save the tokenizer vocabulary files together with: - added tokens, - special-tokens-to-class-attributes-mapping, - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). Warning: This won't save modifications you may have applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers1.PreTrainedTokenizer.from_pretrained` class method. """ if not os.path.isdir(save_directory): logger.error("Saving directory ({}) should be a directory".format(save_directory)) return special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) tokenizer_config = copy.deepcopy(self.init_kwargs) if len(self.init_inputs) > 0: tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) for file_id in self.vocab_files_names.keys(): tokenizer_config.pop(file_id, None) with open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) with open(special_tokens_map_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) if len(self.added_tokens_encoder) > 0: with open(added_tokens_file, "w", encoding="utf-8") as f: out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) f.write(out_str) vocab_files = self.save_vocabulary(save_directory) return vocab_files + (special_tokens_map_file, added_tokens_file) def save_vocabulary(self, save_directory) -> Tuple[str]: """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens and special token mappings. Please use :func:`~transformers1.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers1.PreTrainedTokenizer.from_pretrained` class method. """ raise NotImplementedError def add_tokens(self, new_tokens: Union[str, List[str]]) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. """ if not new_tokens: return 0 if not isinstance(new_tokens, list): new_tokens = [new_tokens] tokens_to_add = [] for token in new_tokens: assert isinstance(token, str) if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() if ( token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add ): tokens_to_add.append(token) logger.info("Adding %s to the vocabulary", token) added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) self.added_tokens_decoder.update(added_tok_decoder) return len(tokens_to_add) def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. Note: This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the number of added tokens in the case of a single sequence if set to False. Returns: Number of tokens added to sequences """ token_ids_0 = [] token_ids_1 = [] return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def add_special_tokens(self, special_tokens_dict): """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). Using `add_special_tokens` will ensure your special tokens can be used in several ways: - special tokens are carefully handled by the tokenizer (they are never split) - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') Args: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``]. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to add a new classification token to GPT-2 tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2') special_tokens_dict = {'cls_token': ''} num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. assert tokenizer.cls_token == '' """ if not special_tokens_dict: return 0 added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) added_tokens += self.add_tokens(value) else: assert isinstance(value, str) added_tokens += self.add_tokens([value]) logger.info("Assigning %s to the %s key of the tokenizer", value, key) setattr(self, key, value) return added_tokens def tokenize(self, text: TextInput, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Take care of added tokens. Args: text (:obj:`string`): The sequence to be encoded. **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method. """ all_special_tokens = self.all_special_tokens text = self.prepare_for_tokenization(text, **kwargs) # TODO: should this be in the base class? def lowercase_text(t): # convert non-special tokens to lowercase escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) if self.init_kwargs.get("do_lower_case", False): text = lowercase_text(text) def split_on_token(tok, text): result = [] split_text = text.split(tok) for i, sub_text in enumerate(split_text): sub_text = sub_text.rstrip() if i == 0 and not sub_text: result += [tok] elif i == len(split_text) - 1: if sub_text: result += [sub_text] else: pass else: if sub_text: result += [sub_text] result += [tok] return result def split_on_tokens(tok_list, text): if not text.strip(): return [] if not tok_list: return self._tokenize(text) tokenized_text = [] text_list = [text] for tok in tok_list: tokenized_text = [] for sub_text in text_list: if sub_text not in self.unique_added_tokens_encoder: tokenized_text += split_on_token(tok, sub_text) else: tokenized_text += [sub_text] text_list = tokenized_text return list( itertools.chain.from_iterable( ( self._tokenize(token) if token not in self.unique_added_tokens_encoder else [token] for token in tokenized_text ) ) ) added_tokens = self.unique_added_tokens_encoder tokenized_text = split_on_tokens(added_tokens, text) return tokenized_text def _tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Do NOT take care of added tokens. """ raise NotImplementedError def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. """ if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): raise NotImplementedError def encode( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, return_tensors: Optional[str] = None, **kwargs ): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary. You can set it to the maximal input size of the model with `max_length = tokenizer.model_max_length`. stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method """ encoded_inputs = self.encode_plus( text, text_pair=text_pair, max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs, ) return encoded_inputs["input_ids"] def encode_plus( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the later only for not-fast tokenizers)): The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary You can set it to the maximal input size of the model with `max_length = tokenizer.model_max_length`. stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. is_pretokenized (:obj:`bool`, defaults to :obj:`False`): Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are token type IDs? <../glossary.html#token-type-ids>`_ return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return overflowing token information (default False). return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return special tokens mask information (default False). return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return (char_start, char_end) for each token (default False). If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast. **kwargs: passed to the `self.tokenize()` method Return: A Dictionary of shape:: { input_ids: list[int], token_type_ids: list[int] if return_token_type_ids is True (default) attention_mask: list[int] if return_attention_mask is True (default) overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. """ def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers." "To use this feature, change your tokenizer to one deriving from " "transformers1.PreTrainedTokenizerFast." "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674" ) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError( "Unable to set proper padding strategy as the tokenizer does not have a padding token. " "In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via the function add_special_tokens if you want to use a padding strategy" ) first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None return self.prepare_for_model( first_ids, pair_ids=second_ids, max_length=max_length, pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, ) def batch_encode_plus( self, batch_text_or_text_pairs: Union[ List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair], List[EncodedInput], List[EncodedInputPair], ], add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_masks: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_masks: bool = False, return_offsets_mapping: bool = False, return_lengths: bool = False, **kwargs ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also: :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in encode_plus) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. is_pretokenized (:obj:`bool`, defaults to :obj:`False`): Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are token type IDs? <../glossary.html#token-type-ids>`_ return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`none`): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return overflowing token information (default False). return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return special tokens mask information (default False). return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return (char_start, char_end) for each token (default False). If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on Rust-based tokenizers inheriting from PreTrainedTokenizerFast. return_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): If set the resulting dictionary will include the length of each encoded inputs **kwargs: passed to the `self.tokenize()` method Return: A Dictionary of shape:: { input_ids: list[List[int]], token_type_ids: list[List[int]] if return_token_type_ids is True (default) attention_mask: list[List[int]] if return_attention_mask is True (default) overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[List[int]] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. """ def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError( "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers." "To use this feature, change your tokenizer to one deriving from " "transformers1.PreTrainedTokenizerFast." "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674" ) input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None first_ids = get_input_ids(ids) second_ids = get_input_ids(pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) if max_length is None and pad_to_max_length: def total_sequence_length(input_pairs): first_ids, second_ids = input_pairs return len(first_ids) + ( self.num_special_tokens_to_add() if second_ids is None else (len(second_ids) + self.num_special_tokens_to_add(pair=True)) ) max_length = max([total_sequence_length(ids) for ids in input_ids]) batch_outputs = {} for first_ids, second_ids in input_ids: # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by # the model. It adds special tokens, truncates sequences if overflowing while taking into account # the special tokens and manages a window stride for overflowing tokens outputs = self.prepare_for_model( first_ids, pair_ids=second_ids, max_length=max_length, pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_attention_mask=return_attention_masks, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_masks, return_lengths=return_lengths, return_tensors=None, # We will convert the whole batch to tensors at the end ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) if return_tensors is not None: self.convert_to_tensors_(batch_outputs, return_tensors) return BatchEncoding(batch_outputs) def convert_to_tensors_(self, batch_outputs: dict, return_tensors: str) -> None: # Do the tensor conversion in batch for key, value in batch_outputs.items(): if return_tensors == "tf" and is_tf_available(): try: batch_outputs[key] = tf.constant(value) except ValueError: if None in [item for sequence in value for item in sequence]: raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) else: raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) elif return_tensors == "pt" and is_torch_available(): try: batch_outputs[key] = torch.tensor(value) except ValueError: raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) except RuntimeError: if None in [item for sequence in value for item in sequence]: raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) else: raise elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( return_tensors ) ) def prepare_for_model( self, ids: List[int], pair_ids: Optional[List[int]] = None, max_length: Optional[int] = None, add_special_tokens: bool = True, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_lengths: bool = False, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative to their model. stride: window stride for overflowing tokens. Can be useful to remove edge effect when using sequential list of inputs. The overflowing token will contains a part of the previous window of tokens. truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default: set to model specifics). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). return_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): If set the resulting dictionary will include the length of each encoded inputs Return: A Dictionary of shape:: { input_ids: list[int], token_type_ids: list[int] if return_token_type_ids is True (default) overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True length: int if return_lengths is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. - ``length``: this is the length of ``input_ids`` """ pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Truncation: Handle max sequence length total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) if max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) # Build output dictionnary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths assert max_length is None or len(encoded_inputs["input_ids"]) <= max_length if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length: logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " "for this model ({} > {}). Running this sequence through the model will result in " "indexing errors".format(len(ids), self.model_max_length) ) # Padding needs_to_be_padded = pad_to_max_length and ( max_length and len(encoded_inputs["input_ids"]) < max_length or max_length is None and len(encoded_inputs["input_ids"]) < self.model_max_length and self.model_max_length <= LARGE_INTEGER ) if pad_to_max_length and max_length is None and self.model_max_length > LARGE_INTEGER: logger.warning( "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." ) if needs_to_be_padded: difference = (max_length if max_length is not None else self.model_max_length) - len( encoded_inputs["input_ids"] ) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference ) if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ "token_type_ids" ] if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: raise ValueError("Invalid padding strategy:" + str(self.padding_side)) else: if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) if return_lengths: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) # Prepare model inputs as tensors if asked if return_tensors == "tf" and is_tf_available(): encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) elif return_tensors == "pt" and is_torch_available(): encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]]) elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( return_tensors ) ) return BatchEncoding(encoded_inputs) def prepare_for_tokenization(self, text: str, **kwargs) -> str: """ Performs any necessary transformations before tokenization """ return text def truncate_sequences( self, ids: List[int], pair_ids: Optional[List[int]] = None, num_tokens_to_remove: int = 0, truncation_strategy: str = "longest_first", stride: int = 0, ) -> Tuple[List[int], List[int], List[int]]: """ Truncates a sequence pair in place to the maximum length. Args: ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): number of tokens to remove using the truncation strategy truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences). Overflowing tokens only contains overflow from the first sequence. - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if truncation_strategy == "longest_first": overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): overflowing_tokens = [ids[-1]] + overflowing_tokens ids = ids[:-1] else: pair_ids = pair_ids[:-1] window_len = min(len(ids), stride) if window_len > 0: overflowing_tokens = ids[-window_len:] + overflowing_tokens elif truncation_strategy == "only_first": assert len(ids) > num_tokens_to_remove window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] elif truncation_strategy == "only_second": assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove window_len = min(len(pair_ids), stride + num_tokens_to_remove) overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif truncation_strategy == "do_not_truncate": raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") else: raise ValueError( "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" ) return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]: if token_ids_1 is None: return len(token_ids_0) * [0] return [0] * len(token_ids_0) + [1] * len(token_ids_1) def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. This implementation does not add special tokens. """ if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def get_special_tokens_mask( self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids for sequence pairs already_has_special_tokens: (default False) Set to True if the token list is already formated with special tokens for the model Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens( self, ids: Union[int, List[int]], skip_special_tokens: bool = False ) -> Union[int, List[int]]: """ Converts a single index or a sequence of indices (integers) in a token " (resp.) a sequence of tokens (str), using the vocabulary and added tokens. Args: skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False """ if isinstance(ids, int): if ids in self.added_tokens_decoder: return self.added_tokens_decoder[ids] else: return self._convert_id_to_token(ids) tokens = [] for index in ids: index = int(index) if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: tokens.append(self.added_tokens_decoder[index]) else: tokens.append(self._convert_id_to_token(index)) return tokens def _convert_id_to_token(self, index: int) -> str: raise NotImplementedError def convert_tokens_to_string(self, tokens: List[str]) -> str: """ Converts a sequence of tokens (string) in a single string. The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) but we often want to remove sub-word tokenization artifacts at the same time. """ return " ".join(self.convert_ids_to_tokens(tokens)) def decode( self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True ) -> str: """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. Args: token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. skip_special_tokens: if set to True, will replace special tokens. clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. """ filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separatly for added tokens and byte-level tokens # cf. https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] for token in filtered_tokens: if skip_special_tokens and token in self.all_special_ids: continue if token in self.added_tokens_encoder: if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) text = " ".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]: return [self.decode(seq, **kwargs) for seq in sequences] @staticmethod def clean_up_tokenization(out_string: str) -> str: """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. """ out_string = ( out_string.replace(" .", ".") .replace(" ?", "?") .replace(" !", "!") .replace(" ,", ",") .replace(" ' ", "'") .replace(" n't", "n't") .replace(" 'm", "'m") .replace(" 's", "'s") .replace(" 've", "'ve") .replace(" 're", "'re") ) return out_string class PreTrainedTokenizerFast(PreTrainedTokenizer): """ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library). Inherit from PreTrainedTokenizer. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). Class attributes (overridden by derived classes): - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. Args: - ``tokenizer`` (`BaseTokenizerFast`): A Fast tokenizer from the HuggingFace tokenizer library (in low level Rust language) - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model. When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`). no associated max_length can be found in ``max_model_input_sizes``. - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. Should be selected between ['right', 'left'] - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the model ("token_type_ids", "attention_mask"...). - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ def __init__(self, tokenizer: BaseTokenizerFast, **kwargs): if not isinstance(tokenizer, BaseTokenizerFast): raise ValueError( "Tokenizer should be an instance of a Tokenizer " "provided by HuggingFace tokenizers library." ) self._tokenizer: BaseTokenizerFast = tokenizer # Initialize all the rest of the kwargs super().__init__(**kwargs) @property def backend_tokenizer(self) -> BaseTokenizerFast: return self._tokenizer @property def decoder(self) -> DecoderFast: return self._tokenizer._tokenizer.decoder @property def is_fast(self) -> bool: return True @property def vocab_size(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=False) def __len__(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=True) def _maybe_update_backend(self, value): """ Update the backend fast tokenizer. Override method from base class SpecialTokensMixin """ self._tokenizer.add_special_tokens(value) def _convert_encoding( self, encoding: EncodingFast, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, ) -> Dict[str, Any]: """ Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are lists (overflows) of lists (tokens). If return_tensors is not None, these lists of lists are converted to 2-D tensors for input_ids, token_type_ids and attention_mask. Output shape: (overflows, sequence length) """ if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names if return_overflowing_tokens and encoding.overflowing is not None: encodings = [encoding] + encoding.overflowing else: encodings = [encoding] encoding_dict = defaultdict(list) for e in encodings: encoding_dict["input_ids"].append(e.ids) if return_token_type_ids: encoding_dict["token_type_ids"].append(e.type_ids) if return_attention_mask: encoding_dict["attention_mask"].append(e.attention_mask) if return_special_tokens_mask: encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) if return_offsets_mapping: encoding_dict["offset_mapping"].append(e.offsets) if return_tensors is not None: for key, value in encoding_dict.items(): if return_tensors == "tf" and is_tf_available(): encoding_dict[key] = tf.constant(value) elif return_tensors == "pt" and is_torch_available(): encoding_dict[key] = torch.tensor(value) elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, " "PyTorch or TensorFlow is not available.".format(return_tensors) ) return encoding_dict def _convert_token_to_id_with_added_voc(self, token: int) -> str: index = self._tokenizer.token_to_id(token) if index is None: return self.unk_token_id return index def _convert_id_to_token(self, index: int) -> Optional[str]: return self._tokenizer.id_to_token(int(index)) def get_vocab(self): return self._tokenizer.get_vocab(True) def convert_tokens_to_string(self, tokens: List[int], skip_special_tokens: bool = False) -> str: return self._tokenizer.decode(tokens, skip_special_tokens) def add_tokens(self, new_tokens: List[Union[str, AddedTokenFast]]) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens: string or list of string or AddedTokenFast. Each string is a token to add. Tokens are only added if they are not already in the vocabulary. AddedTokenFast wrap a string token to let you personnalize it's behavior (Whether this token should only match against single word, whether this token should strip all potential whitespaces on the left side, Whether this token should strip all potential whitespaces on the right side...). See details for AddedToken in HuggingFace tokenizers library. Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. """ if isinstance(new_tokens, str): new_tokens = [new_tokens] return self._tokenizer.add_tokens(new_tokens) def add_special_tokens(self, special_tokens_dict: dict) -> int: # Map special tokens to class attributes (self.pad_token...) super().add_special_tokens(special_tokens_dict) # If the backend tokenizer the only specificities of special tokens are that # - they will never be processed by the model, and # - they will be removed while decoding. # But they are not mapped to special attributes in the backend so we can just # send a list. tokens = [] for token in special_tokens_dict.values(): if isinstance(token, list): tokens += token else: tokens += [token] num_added_tokens = self._tokenizer.add_special_tokens(tokens) return num_added_tokens def num_special_tokens_to_add(self, pair: bool = False) -> int: return self._tokenizer.num_special_tokens_to_add(pair) def tokenize( self, text: TextInput, pair: Optional[TextInput] = None, add_special_tokens: bool = False ) -> List[str]: return self._tokenizer.encode(text, pair, add_special_tokens).tokens def batch_encode_plus( self, batch_text_or_text_pairs: Union[ List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair] ], add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_lengths: bool = False, **kwargs ) -> BatchEncoding: if not isinstance(batch_text_or_text_pairs, list): raise ValueError( "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs)) ) # Needed if we have to return a tensor pad_to_max_length = pad_to_max_length or (return_tensors is not None and len(batch_text_or_text_pairs) > 1) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token") # Set the truncation and padding strategy and restore the initial configuration with truncate_and_pad( tokenizer=self._tokenizer, max_length=max_length, stride=stride, strategy=truncation_strategy, pad_to_max_length=pad_to_max_length, padding_side=self.padding_side, pad_token_id=self.pad_token_id, pad_token_type_id=self.pad_token_type_id, pad_token=self._pad_token, ): # Check for the pretokenized path if is_pretokenized: encodings = [] # Iterate over each sample (we don't know yet if they are pairs or simple input for i, sample in enumerate(batch_text_or_text_pairs): if not isinstance(sample, (list, tuple)): raise TypeError( "batch_encode_plus(..., is_pretokenized=True) requires batch_text_or_text_pairs " "to be either List[List[str]] or List[Tuple[List[str], List[str]]] but sample at " "index {} is of type {}".format(i, type(sample)) ) # Test if we have a pair of sentences by checking the depth of nesting is_pair = bool(len(sample) > 0 and isinstance(sample[0], (list, tuple))) # Take care of the first sequence - we multi-thread over the words encodings_text = EncodingFast.merge( self._tokenizer.encode_batch(sample[0] if is_pair else sample, add_special_tokens=False), growing_offsets=True, ) # Take care of the second sequence if we have a pair if is_pair: encodings_pair = EncodingFast.merge( self._tokenizer.encode_batch([("", s) for s in sample[1]], add_special_tokens=False), growing_offsets=True, ) else: encodings_pair = None # Post-process - truncate/pad and add special tokens encoding = self._tokenizer.post_process(encodings_text, encodings_pair, add_special_tokens) encodings.append(encoding) # Classical path with strings input else: # Avoid thread overhead if only one example. if len(batch_text_or_text_pairs) == 1: if isinstance(batch_text_or_text_pairs[0], (tuple, list)): encodings = self._tokenizer.encode( *batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens ) else: encodings = self._tokenizer.encode( batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens ) encodings = [encodings] else: encodings = self._tokenizer.encode_batch( batch_text_or_text_pairs, add_special_tokens=add_special_tokens ) # Convert encoding to dict # `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]] # with nested dimensions corresponding to batch, overflows, sequence length tokens = [ self._convert_encoding( encoding=encoding, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, ) for encoding in encodings ] # Sanitize the output to have dict[list] from list[dict] sanitized = {} for key in tokens[0].keys(): # To List[List[List[int]]] of shape (batch, overflows, sequence length) stack = [e for item in tokens for e in item[key]] if return_tensors == "tf": stack = tf.stack(stack, axis=0) elif return_tensors == "pt": stack = torch.stack(stack, dim=0) # elif not return_tensors and len(stack) == 1: # stack = stack[0] sanitized[key] = stack # If returning overflowing tokens, we need to return a mapping # from the batch idx to the original sample if return_overflowing_tokens: overflow_to_sample_mapping = flatten([[i] * len(enc["input_ids"]) for i, enc in enumerate(tokens)]) sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping return BatchEncoding(sanitized, encodings) def encode_plus( self, text: Union[TextInput, PreTokenizedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, pad_to_max_length: bool = False, stride: int = 0, truncation_strategy: str = "longest_first", is_pretokenized: bool = False, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs ) -> BatchEncoding: # Check for pretokenized path (ie [token1, token2, ..., tokenN] -> [id1, id2, ..., idN] if is_pretokenized: if isinstance(text, list) and len(text) > 0: # Encode through encode_batch with sequence of only one word which will be merged after hand encoding = self._tokenizer.encode_batch(text, add_special_tokens=False) encoding = EncodingFast.merge(encoding, growing_offsets=True) # Let's do the same for pairs if provided if isinstance(text_pair, list): # We prepend empty string before each word so that encoding is aware content is a pair encoding_pair = self._tokenizer.encode_batch( [("", p) for p in text_pair], add_special_tokens=False ) encoding_pair = EncodingFast.merge(encoding_pair, growing_offsets=True) elif text_pair is None: encoding_pair = None else: raise TypeError( "encode_plus(..., is_pretokenized=True) requires text and text_pair to be List[str] " "but got (text={}, text_pair={})".format(type(text), type(text_pair)) ) # Post process and if asked to do so, insert special tokens where needed encoding = self._tokenizer.post_process(encoding, encoding_pair, add_special_tokens) batched_output = BatchEncoding( self._convert_encoding( encoding, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, ), encoding, ) else: raise TypeError( "encode_plus(..., is_pretokenized=True) requires text to be List[str] " "but got (text={}, text_pair={})".format(type(text), type(text_pair)) ) else: batched_input = [(text, text_pair)] if text_pair else [text] batched_output = self.batch_encode_plus( batched_input, add_special_tokens=add_special_tokens, max_length=max_length, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, pad_to_max_length=pad_to_max_length, **kwargs, ) # Return tensor is None, then we can remove the leading batch axis if not return_tensors: batched_output = BatchEncoding( { key: value[0] if len(value) > 0 and isinstance(value[0], list) else value for key, value in batched_output.items() }, batched_output.encodings, ) return batched_output def decode( self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True ) -> str: text = self._tokenizer.decode(token_ids, skip_special_tokens) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text def save_vocabulary(self, save_directory: str) -> Tuple[str]: if os.path.isdir(save_directory): files = self._tokenizer.save(save_directory) else: folder, file = os.path.split(os.path.abspath(save_directory)) files = self._tokenizer.save(folder, name=file) return tuple(files) def trim_batch( input_ids, pad_token_id, attention_mask=None, ): """Remove columns that are populated exclusively by pad_token_id""" keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) if attention_mask is None: return input_ids[:, keep_column_mask] else: return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_xlm.py ================================================ # coding=utf-8 # Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for XLM.""" import json import logging import os import re import sys import unicodedata from typing import List, Optional import sacremoses as sm from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", }, "merges_file": { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlm-mlm-en-2048": 512, "xlm-mlm-ende-1024": 512, "xlm-mlm-enfr-1024": 512, "xlm-mlm-enro-1024": 512, "xlm-mlm-tlm-xnli15-1024": 512, "xlm-mlm-xnli15-1024": 512, "xlm-clm-enfr-1024": 512, "xlm-clm-ende-1024": 512, "xlm-mlm-17-1280": 512, "xlm-mlm-100-1280": 512, } PRETRAINED_INIT_CONFIGURATION = { "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True}, "xlm-mlm-ende-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "de", "1": "en"}, "lang2id": {"de": 0, "en": 1}, }, "xlm-mlm-enfr-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "fr"}, "lang2id": {"en": 0, "fr": 1}, }, "xlm-mlm-enro-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "ro"}, "lang2id": {"en": 0, "ro": 1}, }, "xlm-mlm-tlm-xnli15-1024": { "do_lowercase_and_remove_accent": True, "id2lang": { "0": "ar", "1": "bg", "2": "de", "3": "el", "4": "en", "5": "es", "6": "fr", "7": "hi", "8": "ru", "9": "sw", "10": "th", "11": "tr", "12": "ur", "13": "vi", "14": "zh", }, "lang2id": { "ar": 0, "bg": 1, "de": 2, "el": 3, "en": 4, "es": 5, "fr": 6, "hi": 7, "ru": 8, "sw": 9, "th": 10, "tr": 11, "ur": 12, "vi": 13, "zh": 14, }, }, "xlm-mlm-xnli15-1024": { "do_lowercase_and_remove_accent": True, "id2lang": { "0": "ar", "1": "bg", "2": "de", "3": "el", "4": "en", "5": "es", "6": "fr", "7": "hi", "8": "ru", "9": "sw", "10": "th", "11": "tr", "12": "ur", "13": "vi", "14": "zh", }, "lang2id": { "ar": 0, "bg": 1, "de": 2, "el": 3, "en": 4, "es": 5, "fr": 6, "hi": 7, "ru": 8, "sw": 9, "th": 10, "tr": 11, "ur": 12, "vi": 13, "zh": 14, }, }, "xlm-clm-enfr-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "fr"}, "lang2id": {"en": 0, "fr": 1}, }, "xlm-clm-ende-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "de", "1": "en"}, "lang2id": {"de": 0, "en": 1}, }, "xlm-mlm-17-1280": { "do_lowercase_and_remove_accent": False, "id2lang": { "0": "ar", "1": "de", "2": "en", "3": "es", "4": "fr", "5": "hi", "6": "it", "7": "ja", "8": "ko", "9": "nl", "10": "pl", "11": "pt", "12": "ru", "13": "sv", "14": "tr", "15": "vi", "16": "zh", }, "lang2id": { "ar": 0, "de": 1, "en": 2, "es": 3, "fr": 4, "hi": 5, "it": 6, "ja": 7, "ko": 8, "nl": 9, "pl": 10, "pt": 11, "ru": 12, "sv": 13, "tr": 14, "vi": 15, "zh": 16, }, }, "xlm-mlm-100-1280": { "do_lowercase_and_remove_accent": False, "id2lang": { "0": "af", "1": "als", "2": "am", "3": "an", "4": "ang", "5": "ar", "6": "arz", "7": "ast", "8": "az", "9": "bar", "10": "be", "11": "bg", "12": "bn", "13": "br", "14": "bs", "15": "ca", "16": "ceb", "17": "ckb", "18": "cs", "19": "cy", "20": "da", "21": "de", "22": "el", "23": "en", "24": "eo", "25": "es", "26": "et", "27": "eu", "28": "fa", "29": "fi", "30": "fr", "31": "fy", "32": "ga", "33": "gan", "34": "gl", "35": "gu", "36": "he", "37": "hi", "38": "hr", "39": "hu", "40": "hy", "41": "ia", "42": "id", "43": "is", "44": "it", "45": "ja", "46": "jv", "47": "ka", "48": "kk", "49": "kn", "50": "ko", "51": "ku", "52": "la", "53": "lb", "54": "lt", "55": "lv", "56": "mk", "57": "ml", "58": "mn", "59": "mr", "60": "ms", "61": "my", "62": "nds", "63": "ne", "64": "nl", "65": "nn", "66": "no", "67": "oc", "68": "pl", "69": "pt", "70": "ro", "71": "ru", "72": "scn", "73": "sco", "74": "sh", "75": "si", "76": "simple", "77": "sk", "78": "sl", "79": "sq", "80": "sr", "81": "sv", "82": "sw", "83": "ta", "84": "te", "85": "th", "86": "tl", "87": "tr", "88": "tt", "89": "uk", "90": "ur", "91": "uz", "92": "vi", "93": "war", "94": "wuu", "95": "yi", "96": "zh", "97": "zh_classical", "98": "zh_min_nan", "99": "zh_yue", }, "lang2id": { "af": 0, "als": 1, "am": 2, "an": 3, "ang": 4, "ar": 5, "arz": 6, "ast": 7, "az": 8, "bar": 9, "be": 10, "bg": 11, "bn": 12, "br": 13, "bs": 14, "ca": 15, "ceb": 16, "ckb": 17, "cs": 18, "cy": 19, "da": 20, "de": 21, "el": 22, "en": 23, "eo": 24, "es": 25, "et": 26, "eu": 27, "fa": 28, "fi": 29, "fr": 30, "fy": 31, "ga": 32, "gan": 33, "gl": 34, "gu": 35, "he": 36, "hi": 37, "hr": 38, "hu": 39, "hy": 40, "ia": 41, "id": 42, "is": 43, "it": 44, "ja": 45, "jv": 46, "ka": 47, "kk": 48, "kn": 49, "ko": 50, "ku": 51, "la": 52, "lb": 53, "lt": 54, "lv": 55, "mk": 56, "ml": 57, "mn": 58, "mr": 59, "ms": 60, "my": 61, "nds": 62, "ne": 63, "nl": 64, "nn": 65, "no": 66, "oc": 67, "pl": 68, "pt": 69, "ro": 70, "ru": 71, "scn": 72, "sco": 73, "sh": 74, "si": 75, "simple": 76, "sk": 77, "sl": 78, "sq": 79, "sr": 80, "sv": 81, "sw": 82, "ta": 83, "te": 84, "th": 85, "tl": 86, "tr": 87, "tt": 88, "uk": 89, "ur": 90, "uz": 91, "vi": 92, "war": 93, "wuu": 94, "yi": 95, "zh": 96, "zh_classical": 97, "zh_min_nan": 98, "zh_yue": 99, }, }, } def get_pairs(word): """ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length strings) """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def lowercase_and_remove_accent(text): """ Lowercase and strips accents from a piece of text based on https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py """ text = " ".join(text) text = text.lower() text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output).lower().split(" ") def replace_unicode_punct(text): """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl """ text = text.replace(",", ",") text = re.sub(r"。\s*", ". ", text) text = text.replace("、", ",") text = text.replace("”", '"') text = text.replace("“", '"') text = text.replace("∶", ":") text = text.replace(":", ":") text = text.replace("?", "?") text = text.replace("《", '"') text = text.replace("》", '"') text = text.replace(")", ")") text = text.replace("!", "!") text = text.replace("(", "(") text = text.replace(";", ";") text = text.replace("1", "1") text = text.replace("」", '"') text = text.replace("「", '"') text = text.replace("0", "0") text = text.replace("3", "3") text = text.replace("2", "2") text = text.replace("5", "5") text = text.replace("6", "6") text = text.replace("9", "9") text = text.replace("7", "7") text = text.replace("8", "8") text = text.replace("4", "4") text = re.sub(r".\s*", ". ", text) text = text.replace("~", "~") text = text.replace("’", "'") text = text.replace("…", "...") text = text.replace("━", "-") text = text.replace("〈", "<") text = text.replace("〉", ">") text = text.replace("【", "[") text = text.replace("】", "]") text = text.replace("%", "%") return text def remove_non_printing_char(text): """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl """ output = [] for char in text: cat = unicodedata.category(char) if cat.startswith("C"): continue output.append(char) return "".join(output) def romanian_preprocessing(text): """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`""" # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma text = text.replace("\u0102", "A").replace("\u0103", "a") text = text.replace("\u00C2", "A").replace("\u00E2", "a") text = text.replace("\u00CE", "I").replace("\u00EE", "i") return text class XLMTokenizer(PreTrainedTokenizer): """ BPE tokenizer for XLM - Moses preprocessing & tokenization for most supported languages - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) - (optionally) lower case & normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ (ex: "__classify__") to a vocabulary - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): Vocabulary file. merges_file (:obj:`string`): Merges file. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["","","","","","","","","",""]`): List of additional special tokens. lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`): Dictionary mapping languages string identifiers to their IDs. id2lang (:obj:`Dict[int, str`, `optional`, defaults to :obj:`None`): Dictionary mapping language IDs to their string identifiers. do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase and remove accents when tokenizing. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, unk_token="", bos_token="", sep_token="", pad_token="", cls_token="", mask_token="", additional_special_tokens=[ "", "", "", "", "", "", "", "", "", "", ], lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, **kwargs ): super().__init__( unk_token=unk_token, bos_token=bos_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) # cache of sm.MosesPunctNormalizer instance self.cache_moses_punct_normalizer = dict() # cache of sm.MosesTokenizer instance self.cache_moses_tokenizer = dict() self.lang_with_custom_tokenizer = set(["zh", "th", "ja"]) # True for current supported model (v1.2.0), False for XLM-17 & 100 self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent self.lang2id = lang2id self.id2lang = id2lang if lang2id is not None and id2lang is not None: assert len(lang2id) == len(id2lang) self.ja_word_tokenizer = None self.zh_word_tokenizer = None with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[:-1] merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} def moses_punct_norm(self, text, lang): if lang not in self.cache_moses_punct_normalizer: punct_normalizer = sm.MosesPunctNormalizer(lang=lang) self.cache_moses_punct_normalizer[lang] = punct_normalizer else: punct_normalizer = self.cache_moses_punct_normalizer[lang] return punct_normalizer.normalize(text) def moses_tokenize(self, text, lang): if lang not in self.cache_moses_tokenizer: moses_tokenizer = sm.MosesTokenizer(lang=lang) self.cache_moses_tokenizer[lang] = moses_tokenizer else: moses_tokenizer = self.cache_moses_tokenizer[lang] return moses_tokenizer.tokenize(text, return_str=False, escape=False) def moses_pipeline(self, text, lang): text = replace_unicode_punct(text) text = self.moses_punct_norm(text, lang) text = remove_non_printing_char(text) return text def ja_tokenize(self, text): if self.ja_word_tokenizer is None: try: import Mykytea self.ja_word_tokenizer = Mykytea.Mykytea( "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~") ) except (AttributeError, ImportError): logger.error( "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps" ) logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") logger.error("4. make && make install") logger.error("5. pip install kytea") raise return list(self.ja_word_tokenizer.getWS(text)) @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token + "" while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) if word == "\n ": word = "\n" self.cache[token] = word return word def _tokenize(self, text, lang="en", bypass_tokenizer=False): """ Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses. Details of tokenization: - [sacremoses](https://github.com/alvations/sacremoses): port of Moses - Install with `pip install sacremoses` - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer - Install with `pip install pythainlp` - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea) - Install with the following steps: ``` git clone git@github.com:neubig/kytea.git && cd kytea autoreconf -i ./configure --prefix=$HOME/local make && make install pip install kytea ``` - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*) - Install with `pip install jieba` (*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally, and set `bypass_tokenizer=True` to bypass the tokenizer. Args: - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. Returns: List of tokens. """ if lang and self.lang2id and lang not in self.lang2id: logger.error( "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." ) if bypass_tokenizer: text = text.split() elif lang not in self.lang_with_custom_tokenizer: text = self.moses_pipeline(text, lang=lang) # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step if lang == "ro": text = romanian_preprocessing(text) text = self.moses_tokenize(text, lang=lang) elif lang == "th": text = self.moses_pipeline(text, lang=lang) try: if "pythainlp" not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize else: th_word_tokenize = sys.modules["pythainlp"].word_tokenize except (AttributeError, ImportError): logger.error( "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps" ) logger.error("1. pip install pythainlp") raise text = th_word_tokenize(text) elif lang == "zh": try: if "jieba" not in sys.modules: import jieba else: jieba = sys.modules["jieba"] except (AttributeError, ImportError): logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") raise text = " ".join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() elif lang == "ja": text = self.moses_pipeline(text, lang=lang) text = self.ja_tokenize(text) else: raise ValueError("It should not reach here") if self.do_lowercase_and_remove_accent and not bypass_tokenizer: text = lowercase_and_remove_accent(text) split_tokens = [] for token in text: if token: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = "".join(tokens).replace("", " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A XLM sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ bos = [self.bos_token_id] sep = [self.sep_token_id] if token_ids_1 is None: return bos + token_ids_0 + sep return bos + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0,)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for XLM-RoBERTa model.""" import logging import os from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlm-roberta-base": 512, "xlm-roberta-large": 512, "xlm-roberta-large-finetuned-conll02-dutch": 512, "xlm-roberta-large-finetuned-conll02-spanish": 512, "xlm-roberta-large-finetuned-conll03-english": 512, "xlm-roberta-large-finetuned-conll03-german": 512, } class XLMRobertaTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # Original fairseq vocab and spm vocab must be "aligned": # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' # Mimic fairseq token-to-id alignment for the first 4 token self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A XLM-R sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-R does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): return len(self.sp_model) + self.fairseq_offset + 1 # Add the token def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] spm_id = self.sp_model.PieceToId(token) # Need to return unknown token if the SP model returned 0 return spm_id + self.fairseq_offset if spm_id else self.unk_token_id def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/tokenization_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for XLNet model.""" import logging import os import unicodedata from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlnet-base-cased": None, "xlnet-large-cased": None, } SPIECE_UNDERLINE = "▁" # Segments (not really needed) SEG_ID_A = 0 SEG_ID_B = 1 SEG_ID_CLS = 2 SEG_ID_SEP = 3 SEG_ID_PAD = 4 class XLNetTokenizer(PreTrainedTokenizer): """ Constructs an XLNet tokenizer. Based on `SentencePiece `__ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a .spm extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES padding_side = "left" def __init__( self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, bos_token="", eos_token="", unk_token="", sep_token="", pad_token="", cls_token="", mask_token="", additional_special_tokens=["", ""], **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) self._pad_token_type_id = 3 try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return len(self.sp_model) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() return outputs def _tokenize(self, text, sample=False): """ Tokenize a string. """ text = self.preprocess_text(text) if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] else: cur_pieces[0] = cur_pieces[0][1:] cur_pieces.append(piece[-1]) new_pieces.extend(cur_pieces) else: new_pieces.append(piece) return new_pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An XLNet sequence has the following format: - single sequence: ``X `` - pair of sequences: ``A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] return ([0] * len(token_ids_0)) + [1, 1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 | first sequence | second sequence | CLS segment ID if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls_segment_id = [2] if token_ids_1 is None: return len(token_ids_0 + sep) * [0] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/trainer.py ================================================ import json import logging import math import os import random import re import shutil from contextlib import contextmanager from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple import time import numpy as np import torch from packaging import version from torch import nn from torch.utils.data.dataloader import DataLoader from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler from tqdm.auto import tqdm, trange from .data.data_collator import DataCollator, DefaultDataCollator from transformers.modeling_utils import PreTrainedModel from .optimization import AdamW from transformers import get_polynomial_decay_schedule_with_warmup#需要新版才有 from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput from .training_args import TrainingArguments, is_tpu_available try: from apex import amp _has_apex = True except ImportError: _has_apex = False def is_apex_available(): return _has_apex if is_tpu_available(): import torch_xla.core.xla_model as xm import torch_xla.debug.metrics as met import torch_xla.distributed.parallel_loader as pl try: from torch.utils.tensorboard import SummaryWriter _has_tensorboard = True except ImportError: try: from tensorboardX import SummaryWriter _has_tensorboard = True except ImportError: _has_tensorboard = False def is_tensorboard_available(): return _has_tensorboard try: import wandb wandb.ensure_configured() if wandb.api.api_key is None: _has_wandb = False wandb.termwarn("W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.") else: _has_wandb = False if os.getenv("WANDB_DISABLED") else True except ImportError: _has_wandb = False def is_wandb_available(): return _has_wandb logger = logging.getLogger(__name__) def set_seed(seed: int): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # ^^ safe to call this function even if cuda is not available @contextmanager def torch_distributed_zero_first(local_rank: int): """ Decorator to make all processes in distributed training wait for each local_master to do something. """ if local_rank not in [-1, 0]: torch.distributed.barrier() yield if local_rank == 0: torch.distributed.barrier() class SequentialDistributedSampler(Sampler): """ Distributed Sampler that subsamples indicies sequentially, making it easier to collate all results at the end. Even though we only use this sampler for eval and predict (no training), which means that the model params won't have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather` or `reduce` resulting tensors at the end of the loop. """ def __init__(self, dataset, num_replicas=None, rank=None): if num_replicas is None: if not torch.distributed.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = torch.distributed.get_world_size() if rank is None: if not torch.distributed.is_available(): raise RuntimeError("Requires distributed package to be available") rank = torch.distributed.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) self.total_size = self.num_samples * self.num_replicas def __iter__(self): indices = list(range(len(self.dataset))) # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] assert len(indices) == self.total_size # subsample indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def get_tpu_sampler(dataset: Dataset): if xm.xrt_world_size() <= 1: return RandomSampler(dataset) return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) class Trainer: """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers. """ model: PreTrainedModel args: TrainingArguments train_dataset: Optional[Dataset] eval_dataset: Optional[Dataset] compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool tb_writer: Optional["SummaryWriter"] = None optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None global_step: Optional[int] = None epoch: Optional[float] = None def __init__( self, model: PreTrainedModel, args: TrainingArguments, train_dataLoader: Optional[DataLoader] = None, eval_dataLoader: Optional[DataLoader] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, tb_writer: Optional["SummaryWriter"] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None, ): """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers. Args: prediction_loss_only: (Optional) in evaluation and prediction, only return the loss """ self.model = model.to(args.device) self.args = args self.train_dataLoader = train_dataLoader self.eval_dataLoader = eval_dataLoader self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only self.optimizers = optimizers if tb_writer is not None: self.tb_writer = tb_writer elif is_tensorboard_available() and self.is_world_master(): self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir) if not is_tensorboard_available(): logger.warning( "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it." ) if is_wandb_available(): self._setup_wandb() else: logger.info( "You are instantiating a Trainer but W&B is not installed. To use wandb logging, " "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface." ) set_seed(self.args.seed) # Create output directory if needed if self.is_world_master(): os.makedirs(self.args.output_dir, exist_ok=True) if is_tpu_available(): # Set an xla_device flag on the model's config. # We'll find a more elegant and not need to do this in the future. self.model.config.xla_device = True def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: # We use the same batch_size as for eval. if is_tpu_available(): sampler = SequentialDistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal() ) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(test_dataset) else: sampler = SequentialSampler(test_dataset) data_loader = DataLoader( test_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, ) return data_loader def get_optimizers( self, num_training_steps: int ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]: """ Setup the optimizer and the learning rate scheduler. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init, or override this method in a subclass. """ if self.optimizers is not None: return self.optimizers # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.args.weight_decay, }, { "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps,lr_end=self.args.lr_end ) return optimizer, scheduler def _setup_wandb(self): """ Setup the optional Weights & Biases (`wandb`) integration. One can override this method to customize the setup if needed. Find more information at https://docs.wandb.com/huggingface You can also override the following environment variables: Environment: WANDB_WATCH: (Optional, ["gradients", "all", "false"]) "gradients" by default, set to "false" to disable gradient logging or "all" to log gradients and parameters WANDB_PROJECT: (Optional): str - "huggingface" by default, set this to a custom string to store results in a different project WANDB_DISABLED: (Optional): boolean - defaults to false, set to "true" to disable wandb entirely """ logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"') wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args)) # keep track of model topology and gradients if os.getenv("WANDB_WATCH") != "false": wandb.watch( self.model, log=os.getenv("WANDB_WATCH", "gradients"), log_freq=max(100, self.args.logging_steps) ) def num_examples(self, dataloader: DataLoader) -> int: """ Helper to get num of examples from a DataLoader, by accessing its Dataset. """ return len(dataloader.dataset) def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path: (Optional) Local path to model if model to train has been instantiated from a local path If present, we will try reloading the optimizer/scheduler states from there. """ train_dataloader = self.train_dataLoader if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 tqdmLoss=0#进度条的loss用滑动平均显示 beta_exp=1 model.zero_grad() train_iterator = trange( epochs_trained, int(num_train_epochs), desc="Epoch", disable=True ) for epoch in train_iterator: last=time.time() if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) if is_tpu_available(): parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master()) else: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True,ncols=70)#固定下长度,不然要换行 for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue now_loss=self._training_step(model, inputs, optimizer) tr_loss += now_loss #丰富进度条 tqdmLoss=tqdmLoss*0.99+(1-0.99)*now_loss#滑动平均下 beta_exp*=0.99#校正 epoch_iterator.set_description_str(f"epoch:{epoch+1}") epoch_iterator.set_postfix_str(f"loss:{round(tqdmLoss/(1-beta_exp),4)}") if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator) ): if self.args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) logging_loss = tr_loss print()#log前要换行,不然和进度条挤在一起 self._log(logs) print() if self.args.evaluate_during_training: self.evaluate() if self.args.save_steps > 0 and self.global_step % self.args.save_steps==0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. if hasattr(model, "module"): assert model.module is self.model else: assert model is self.model # Save model checkpoint output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}-epoch-{int(self.epoch)}") self.save_model(output_dir) if self.is_world_master(): self._rotate_checkpoints() if is_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) elif self.is_world_master(): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: epoch_iterator.close() break print(f"预训练第{epoch}轮耗时:",time.time()-last) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: train_iterator.close() break if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return TrainOutput(self.global_step, tr_loss / self.global_step) def _log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if self.epoch is not None: logs["epoch"] = self.epoch if self.tb_writer: for k, v in logs.items(): self.tb_writer.add_scalar(k, v, self.global_step) if is_wandb_available(): wandb.log(logs, step=self.global_step) output = json.dumps({**logs, **{"step": self.global_step}}) if iterator is not None: iterator.write(output) else: print(output) def _training_step( self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer ) -> float: model.train() for k, v in inputs.items(): inputs[k] = v.to(self.args.device) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers1 (see doc) if self.args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps if self.args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() return loss.item() def is_local_master(self) -> bool: if is_tpu_available(): return xm.is_master_ordinal(local=True) else: return self.args.local_rank in [-1, 0] def is_world_master(self) -> bool: """ This will be True only in one process, even in distributed mode, even when training on multiple machines. """ if is_tpu_available(): return xm.is_master_ordinal(local=False) else: return self.args.local_rank == -1 or torch.distributed.get_rank() == 0 def save_model(self, output_dir: Optional[str] = None): """ Saving best-practices: if you use default names for the model, you can reload it using from_pretrained(). Will only save from the world_master process (unless in TPUs). """ if is_tpu_available(): self._save_tpu(output_dir) elif self.is_world_master(): self._save(output_dir) def _save_tpu(self, output_dir: Optional[str] = None): output_dir = output_dir if output_dir is not None else self.args.output_dir logger.info("Saving model checkpoint to %s", output_dir) if xm.is_master_ordinal(): os.makedirs(output_dir, exist_ok=True) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") xm.rendezvous("saving_checkpoint") self.model.save_pretrained(output_dir) def _save(self, output_dir: Optional[str] = None): output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) logger.info("Saving model checkpoint to %s", output_dir) # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") self.model.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, "training_args.bin")) def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: ordering_and_checkpoint_path = [] glob_checkpoints = [str(x) for x in Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*")] for path in glob_checkpoints: if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) if regex_match and regex_match.groups(): ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) checkpoints_sorted = sorted(ordering_and_checkpoint_path) checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] return checkpoints_sorted def _rotate_checkpoints(self, use_mtime=False) -> None: if self.args.save_total_limit is None or self.args.save_total_limit <= 0: return # Check if we should delete older checkpoint(s) checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime) if len(checkpoints_sorted) <= self.args.save_total_limit: return number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args.save_total_limit) checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] for checkpoint in checkpoints_to_be_deleted: curEpoch = checkpoint.split('-')[-1] print(checkpoint,curEpoch) if int(curEpoch) % 50 == 0: continue logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) shutil.rmtree(checkpoint) def evaluate( self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None, ) -> Dict[str, float]: """ Run evaluation and return metrics. The calling script will be responsible for providing a method to compute metrics, as they are task-dependent. Args: eval_dataset: (Optional) Pass a dataset if you wish to override the one on the instance. Returns: A dict containing: - the eval loss - the potential metrics computed from the predictions """ eval_dataloader = self.eval_dataLoader output = self._prediction_loop(eval_dataloader, description="Evaluation") self._log(output.metrics) if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) return output.metrics def predict(self, test_dataset: Dataset) -> PredictionOutput: """ Run prediction and return predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in evaluate(). """ test_dataloader = self.get_test_dataloader(test_dataset) return self._prediction_loop(test_dataloader, description="Prediction") def _prediction_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. Works both with or without labels. """ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only model = self.model # multi-gpu eval if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) else: model = self.model # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", self.num_examples(dataloader)) logger.info(" Batch size = %d", batch_size) eval_losses: List[float] = [] preds: torch.Tensor = None label_ids: torch.Tensor = None model.eval() if is_tpu_available(): dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) for inputs in tqdm(dataloader, desc=description): has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"]) for k, v in inputs.items(): inputs[k] = v.to(self.args.device) with torch.no_grad(): outputs = model(**inputs) if has_labels: step_eval_loss, logits = outputs[:2] eval_losses += [step_eval_loss.mean().item()] else: logits = outputs[0] if not prediction_loss_only: if preds is None: preds = logits.detach() else: preds = torch.cat((preds, logits.detach()), dim=0) if inputs.get("labels") is not None: if label_ids is None: label_ids = inputs["labels"].detach() else: label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0) if self.args.local_rank != -1: # In distributed mode, concatenate all results from all nodes: if preds is not None: preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) if label_ids is not None: label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) elif is_tpu_available(): # tpu-comment: Get all predictions and labels from all worker shards of eval dataset if preds is not None: preds = xm.mesh_reduce("eval_preds", preds, torch.cat) if label_ids is not None: label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat) # Finally, turn the aggregated tensors into numpy arrays. if preds is not None: preds = preds.cpu().numpy() if label_ids is not None: label_ids = label_ids.cpu().numpy() if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} if len(eval_losses) > 0: metrics["eval_loss"] = np.mean(eval_losses) # Prefix all keys with eval_ for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> torch.Tensor: assert self.args.local_rank != -1 output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensor) concat = torch.cat(output_tensors, dim=0) # truncate the dummy elements added by SequentialDistributedSampler output = concat[:num_total_examples] return output ================================================ FILE: code/bert-base-count3/pretrain/transformers1/trainer_tf.py ================================================ """Tensorflow trainer class.""" import logging import math import os from typing import Callable, Dict, Optional import numpy as np import tensorflow as tf from .modeling_tf_utils import TFPreTrainedModel, shape_list from .optimization_tf import GradientAccumulator, create_optimizer from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput from .training_args_tf import TFTrainingArguments logger = logging.getLogger(__name__) class TFTrainer: model: TFPreTrainedModel args: TFTrainingArguments # something similar to a PT Dataset. # This is just temporary before to have # a framework-agnostic approach for datasets. train_dataset: Optional[tf.data.Dataset] eval_dataset: Optional[tf.data.Dataset] compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool def __init__( self, model: TFPreTrainedModel, args: TFTrainingArguments, train_dataset: Optional[tf.data.Dataset] = None, eval_dataset: Optional[tf.data.Dataset] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, ): self.model = model self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only self.gradient_accumulator = GradientAccumulator() self._setup_training() def _setup_training(self) -> None: """ Setup the different steps to train a model: - check if all the data are given - create the proper strategy - create the features - prepare the model settings """ self._prepare_dataset() with self.args.strategy.scope(): self._create_optimizer() _ = self.optimizer.iterations self._set_loss_and_metric() self._create_checkpoint_manager() self._create_summary_writer() def _set_loss_and_metric(self) -> None: """ Create the training loss and metric with their name. Allowed names are those listed in the Tensorflow documentation and those contained in the transformers1 library. """ try: self.loss = tf.keras.losses.get( { "class_name": self.args.loss_name, "config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE}, } ) except TypeError: self.loss = tf.keras.losses.get( {"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}} ) def _create_summary_writer(self) -> None: """ Create a summary writer to be able to read the logs in Tensorboard. """ self.writer = tf.summary.create_file_writer(self.args.logging_dir) def _prepare_dataset(self) -> None: """ Prepare the training, validation and test data. """ if self.train_dataset is not None: self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() if self.args.max_steps > 0: self.train_steps = self.args.max_steps else: self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size) self.train_dataset = ( self.train_dataset.cache() .shuffle(self.num_train_examples) .batch(self.args.train_batch_size) .prefetch(tf.data.experimental.AUTOTUNE) ) if self.args.max_steps > 0: self.train_dataset = self.train_dataset.repeat(-1) self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset) else: self.train_steps = 0 if self.eval_dataset is not None: self.eval_dataset = ( self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE) ) self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset) def _create_optimizer(self) -> None: """ Create the training optimizer with its name. Allowed names are those listed in the Tensorflow documentation and those contained in the transformers1 library. """ if self.args.optimizer_name == "adamw": self.optimizer = create_optimizer( self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr ) else: try: self.optimizer = tf.keras.optimizers.get( { "class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon}, } ) except TypeError: # This is for the case where the optimizer is not Adam-like such as SGD self.optimizer = tf.keras.optimizers.get( {"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}} ) logger.info("Created an/a {} optimizer".format(self.args.optimizer_name)) def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None: """ Create a checkpoint manager in order to be able to make the training fault-tolerant. Args: max_to_keep: the maximum number of checkpoints to keep in the checkpoint path. load_model: if we want to start the training from the latest checkpoint. """ ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep) if load_model: ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial() @tf.function def _evaluate_steps(self, per_replica_features, per_replica_labels): """ One step evaluation across replica. Args: per_replica_features: the batched features. per_replica_labels: the batched labels. Returns: The loss corresponding to the given batch. """ per_replica_loss, per_replica_logits = self.args.strategy.experimental_run_v2( self._run_model, args=(per_replica_features, per_replica_labels, False) ) try: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) except ValueError: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) return reduced_loss, per_replica_logits def _prediction_loop( self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: logger.info("***** Running %s *****", description) logger.info(" Batch size = %d", self.args.eval_batch_size) label_ids: np.ndarray = None preds: np.ndarray = None step: int = 1 for features, labels in dataset: step = tf.convert_to_tensor(step, dtype=tf.int64) loss, logits = self._evaluate_steps(features, labels) loss = tf.reduce_mean(loss) if not prediction_loss_only: if self.args.n_gpu > 1: for val in logits.values: if preds is None: preds = val.numpy() else: preds = np.append(preds, val.numpy(), axis=0) for val in labels.values: if label_ids is None: label_ids = val.numpy() else: label_ids = np.append(label_ids, val.numpy(), axis=0) else: if preds is None: preds = logits.numpy() else: preds = np.append(preds, logits.numpy(), axis=0) if label_ids is None: label_ids = labels.numpy() else: label_ids = np.append(label_ids, labels.numpy(), axis=0) step += 1 if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} metrics["eval_loss"] = loss.numpy() for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) def evaluate( self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None ) -> Dict[str, float]: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. """ if eval_dataset is None: eval_dataset = self.eval_dataset output = self._prediction_loop(eval_dataset, description="Evaluation") return output.metrics def train(self) -> None: """ Train method to train the model. """ if self.args.debug: tf.summary.trace_on(graph=True, profiler=True) self.gradient_accumulator.reset() iterations = self.optimizer.iterations if iterations.numpy() > 0: logger.info("Start the training from the last checkpoint") start_epoch = (iterations.numpy() // self.train_steps) + 1 else: start_epoch = 1 tf.summary.experimental.set_step(iterations) epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_train_examples) logger.info(" Num Epochs = %d", epochs) logger.info(" Total optimization steps = %d", self.train_steps) for epoch in range(start_epoch, int(epochs + 1)): for training_loss in self._training_steps(): step = iterations.numpy() if self.args.debug: with self.writer.as_default(): tf.summary.scalar("loss", training_loss, step=step) if step == 1 and self.args.debug: with self.writer.as_default(): tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir) if self.args.evaluate_during_training and step % self.args.eval_steps == 0: logs = {} results = self.evaluate() for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value if callable(self.optimizer.learning_rate): logs["learning_rate"] = self.optimizer.learning_rate(step).numpy() else: logs["learning_rate"] = self.optimizer.learning_rate.numpy() logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs)) with self.writer.as_default(): for k, v in logs.items(): tf.summary.scalar(k, v, step=step) if step % self.args.logging_steps == 0: logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy())) if step % self.args.save_steps == 0: ckpt_save_path = self.model.ckpt_manager.save() logger.info("Saving checkpoint for step {} at {}".format(step, ckpt_save_path)) if step % self.train_steps == 0: break def _training_steps(self): """ Returns a generator over training steps (i.e. parameters update). """ for i, loss in enumerate(self._accumulate_next_gradients()): if i % self.args.gradient_accumulation_steps == 0: self._apply_gradients() yield loss @tf.function def _apply_gradients(self): """Applies the gradients (cross-replica).""" self.args.strategy.experimental_run_v2(self._step) def _step(self): """Applies gradients and resets accumulation.""" gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync gradients = [ gradient / tf.cast(gradient_scale, gradient.dtype) for gradient in self.gradient_accumulator.gradients ] gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients] self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) self.gradient_accumulator.reset() def _accumulate_next_gradients(self): """Accumulates the gradients from the next element in dataset.""" iterator = iter(self.train_dataset) @tf.function def _accumulate_next(): per_replica_features, per_replica_labels = next(iterator) return self._accumulate_gradients(per_replica_features, per_replica_labels) while True: try: yield _accumulate_next() except tf.errors.OutOfRangeError: break def _accumulate_gradients(self, per_replica_features, per_replica_labels): """Accumulates the gradients across all the replica.""" per_replica_loss = self.args.strategy.experimental_run_v2( self._forward, args=(per_replica_features, per_replica_labels) ) try: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) except ValueError: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) return reduced_loss def _forward(self, features, labels): """Forwards a training example and accumulates the gradients.""" per_example_loss, _ = self._run_model(features, labels, True) gradients = tf.gradients(per_example_loss, self.model.trainable_variables) gradients = [ g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables) ] self.gradient_accumulator(gradients) return per_example_loss def _run_model(self, features, labels, training): """ Computes the loss of the given features and labels pair. Args: features: the batched features. labels: the batched labels. training: run the model in training mode or not """ if self.args.mode == "text-classification" or self.args.mode == "token-classification": logits = self.model(features, training=training)[0] else: logits = self.model(features, training=training) if self.args.mode == "token-classification": active_loss = tf.reshape(labels, (-1,)) != -1 reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) loss = self.loss(labels, reduced_logits) elif self.args.mode == "question-answering": start_loss = self.loss(labels["start_position"], logits[0]) end_loss = self.loss(labels["end_position"], logits[1]) loss = (start_loss + end_loss) / 2.0 else: loss = self.loss(labels, logits) loss += sum(self.model.losses) * (1.0 / self.args.n_gpu) return loss, logits def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput: """ Run prediction and return predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in evaluate(). Args: test_dataset: something similar to a PT Dataset. This is just temporary before to have a framework-agnostic approach for datasets. """ test_dataset = test_dataset.batch(self.args.eval_batch_size) test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset) return self._prediction_loop(test_dataset, description="Prediction") def save_model(self) -> None: """ Save the pretrained model and create a Tensorflow saved model. """ logger.info("Saving model in {}".format(self.args.output_dir)) path = os.path.join(self.args.output_dir, "saved_model") logger.info("Saving model in {}".format(path)) os.makedirs(path, exist_ok=True) self.model.save_pretrained(self.args.output_dir) ================================================ FILE: code/bert-base-count3/pretrain/transformers1/trainer_utils.py ================================================ from typing import Dict, NamedTuple, Optional import numpy as np class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. """ predictions: np.ndarray label_ids: np.ndarray class PredictionOutput(NamedTuple): predictions: np.ndarray label_ids: Optional[np.ndarray] metrics: Optional[Dict[str, float]] class TrainOutput(NamedTuple): global_step: int training_loss: float PREFIX_CHECKPOINT_DIR = "checkpoint" ================================================ FILE: code/bert-base-count3/pretrain/transformers1/training_args.py ================================================ import dataclasses import json import logging from dataclasses import dataclass, field from typing import Any, Dict, Optional, Tuple from .file_utils import cached_property, is_torch_available, torch_required if is_torch_available(): import torch try: import torch_xla.core.xla_model as xm _has_tpu = True except ImportError: _has_tpu = False @torch_required def is_tpu_available(): return _has_tpu logger = logging.getLogger(__name__) @dataclass class TrainingArguments: """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ output_dir: str = field( metadata={"help": "The output directory where the model predictions and checkpoints will be written."} ) overwrite_output_dir: bool = field( default=False, metadata={ "help": ( "Overwrite the content of the output directory." "Use this to continue training if output_dir points to a checkpoint directory." ) }, ) do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) evaluate_during_training: bool = field( default=False, metadata={"help": "Run evaluation during training at each logging step."}, ) per_device_train_batch_size: int = field( default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."} ) per_device_eval_batch_size: int = field( default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."} ) per_gpu_train_batch_size: Optional[int] = field( default=None, metadata={ "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. " "Batch size per GPU/TPU core/CPU for training." }, ) per_gpu_eval_batch_size: Optional[int] = field( default=None, metadata={ "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred." "Batch size per GPU/TPU core/CPU for evaluation." }, ) gradient_accumulation_steps: int = field( default=1, metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, ) learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."}) lr_end: float = field(default=1e-5, metadata={"help": "学习率最后衰减到多少."}) weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."}) adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."}) max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) max_steps: int = field( default=-1, metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, ) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"}) logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_total_limit: Optional[int] = field( default=None, metadata={ "help": ( "Limit the total amount of checkpoints." "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" ) }, ) no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) seed: int = field(default=42, metadata={"help": "random seed for initialization"}) fp16: bool = field( default=False, metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"}, ) fp16_opt_level: str = field( default="O1", metadata={ "help": ( "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html" ) }, ) local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) tpu_num_cores: Optional[int] = field( default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"} ) tpu_metrics_debug: bool = field(default=False, metadata={"help": "TPU: Whether to print debug metrics"}) @property def train_batch_size(self) -> int: if self.per_gpu_train_batch_size: logger.warning( "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " "version. Using `--per_device_train_batch_size` is preferred." ) per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size return per_device_batch_size * max(1, self.n_gpu) @property def eval_batch_size(self) -> int: if self.per_gpu_eval_batch_size: logger.warning( "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " "version. Using `--per_device_eval_batch_size` is preferred." ) per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size return per_device_batch_size * max(1, self.n_gpu) @cached_property @torch_required def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_tpu_available(): device = xm.xla_device() n_gpu = 0 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) n_gpu = 1 return device, n_gpu @property @torch_required def device(self) -> "torch.device": return self._setup_devices[0] @property @torch_required def n_gpu(self): return self._setup_devices[1] def to_json_string(self): """ Serializes this instance to a JSON string. """ return json.dumps(dataclasses.asdict(self), indent=2) def to_sanitized_dict(self) -> Dict[str, Any]: """ Sanitized serialization to use with TensorBoard’s hparams """ d = dataclasses.asdict(self) valid_types = [bool, int, float, str] if is_torch_available(): valid_types.append(torch.Tensor) return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} ================================================ FILE: code/bert-base-count3/pretrain/transformers1/training_args_tf.py ================================================ import logging from dataclasses import dataclass, field from typing import Tuple from .file_utils import cached_property, is_tf_available, tf_required from .training_args import TrainingArguments logger = logging.getLogger(__name__) if is_tf_available(): import tensorflow as tf @dataclass class TFTrainingArguments(TrainingArguments): optimizer_name: str = field( default="adam", metadata={ "help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"' }, ) mode: str = field( default="text-classification", metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'}, ) loss_name: str = field( default="SparseCategoricalCrossentropy", metadata={ "help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses" }, ) tpu_name: str = field( default=None, metadata={"help": "Name of TPU"}, ) end_lr: float = field( default=0, metadata={"help": "End learning rate for optimizer"}, ) eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) debug: bool = field( default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} ) @cached_property @tf_required def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]: logger.info("Tensorflow: setting up strategy") gpus = tf.config.list_physical_devices("GPU") if self.no_cuda: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: try: if self.tpu_name: tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) else: tpu = tf.distribute.cluster_resolver.TPUClusterResolver() except ValueError: tpu = None if tpu: tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initialize_tpu_system(tpu) strategy = tf.distribute.experimental.TPUStrategy(tpu) elif len(gpus) == 0: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") elif len(gpus) == 1: strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") elif len(gpus) > 1: # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` strategy = tf.distribute.MirroredStrategy() else: raise ValueError("Cannot find the proper strategy please check your environment properties.") return strategy @property @tf_required def strategy(self) -> "tf.distribute.Strategy": return self._setup_strategy @property @tf_required def n_gpu(self) -> int: return self._setup_strategy.num_replicas_in_sync ================================================ FILE: code/bert-base-count3/pretrain/transformers1/try.py ================================================ from transformers import TFAlbertForMaskedLM, TFAlbertModel, TFAlbertForSequenceClassification, AlbertForMaskedLM import os checkpoint = "albert-base-v1" model = AlbertForMaskedLM.from_pretrained(checkpoint) if not os.path.exists("~/saved/" + checkpoint): os.makedirs("~/saved/" + checkpoint) model.save_pretrained("~/saved/" + checkpoint) model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint, from_pt=True) model.save_pretrained("~/saved/" + checkpoint) model = TFAlbertModel.from_pretrained('~/saved/' + checkpoint) model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint) model = TFAlbertForSequenceClassification.from_pretrained('~/saved/' + checkpoint) print("nice model") ================================================ FILE: code/bert-base-count3/pretrain/transformers1/utils_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classes to support Encoder-Decoder architectures """ def prepare_encoder_decoder_model_kwargs(**kwargs): """ Prepare the encoder and decoder's keyword arguments. Keyword arguments come in 3 flavors: - encoder-specific (prefixed by `encoder_`) - decoder-specific (prefixed by `decoder_`) - those that apply to the model as whole. We let the specific kwargs override the common ones in case of conflict. """ kwargs_common = { argument: value for argument, value in kwargs.items() if not argument.startswith("encoder_") and not argument.startswith("decoder_") } if "input_ids" in kwargs_common: kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") decoder_kwargs = kwargs_common.copy() encoder_kwargs = kwargs_common.copy() encoder_kwargs.update( {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} ) decoder_kwargs.update( {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} ) decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) return encoder_kwargs, decoder_kwargs ================================================ FILE: code/bert-base-count3-len100/finetuning/.ipynb_checkpoints/PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved. \n", "Licensed under the MIT License." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference PyTorch Bert Model with ONNX Runtime on GPU" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, you'll learn how to load a Bert model from PyTorch, convert it to ONNX, and inference it for high performance using ONNX Runtime and NVIDIA GPU. In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text from the corresponding reading passage, or the question might be unanswerable.\n", "\n", "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Prerequisites ##\n", "It requires your machine to have a GPU, and a python environment with [PyTorch](https://pytorch.org/) installed before running this notebook.\n", "\n", "#### GPU Environment Setup using AnaConda\n", "\n", "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", "\n", "```console\n", "conda create -n gpu_env python=3.7\n", "conda activate gpu_env\n", "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n", "conda install -c anaconda ipykernel\n", "conda install -c conda-forge ipywidgets\n", "python -m ipykernel install --user --name=gpu_env_py37\n", "jupyter notebook\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n", "\n", "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Skipping onnxruntime-gpu as it is not installed.\u001b[0m\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet --upgrade transformers\n", "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n", "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", "!{sys.executable} -m pip install --quiet wget netron pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Load Pretrained Bert model ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We begin by downloading the SQuAD data file and store them in the specified location. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "cache_dir = \"./squad\"\n", "if not os.path.exists(cache_dir):\n", " os.makedirs(cache_dir)\n", "\n", "predict_file_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json\"\n", "predict_file = os.path.join(cache_dir, \"dev-v1.1.json\")\n", "if not os.path.exists(predict_file):\n", " import wget\n", " print(\"Start downloading predict file.\")\n", " wget.download(predict_file_url, predict_file)\n", " print(\"Predict file downloaded.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first define some constant variables." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Whether allow overwriting existing ONNX model and download the latest script from GitHub\n", "enable_overwrite = True\n", "\n", "# Total samples to inference, so that we can get average latency\n", "total_samples = 1000\n", "\n", "# ONNX opset version\n", "opset_version=11" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Specify some model configuration variables." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n", "model_name_or_path = \"bert-base-cased\"\n", "max_seq_length = 128\n", "doc_stride = 128\n", "max_query_length = 64" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Start to load model from pretrained. This step could take a few minutes. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n", "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n", "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n" ] } ], "source": [ "# The following code is adapted from HuggingFace transformers\n", "# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n", "\n", "from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "\n", "# Load pretrained model and tokenizer\n", "config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n", "tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", "model = model_class.from_pretrained(model_name_or_path,\n", " from_tf=False,\n", " config=config,\n", " cache_dir=cache_dir)\n", "# load some examples\n", "from transformers.data.processors.squad import SquadV1Processor\n", "\n", "processor = SquadV1Processor()\n", "examples = processor.get_dev_examples(None, filename=predict_file)\n", "\n", "from transformers import squad_convert_examples_to_features\n", "features, dataset = squad_convert_examples_to_features( \n", " examples=examples[:total_samples], # convert enough examples for this notebook\n", " tokenizer=tokenizer,\n", " max_seq_length=max_seq_length,\n", " doc_stride=doc_stride,\n", " max_query_length=max_query_length,\n", " is_training=False,\n", " return_dataset='pt'\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Export the loaded model ##\n", "Once the model is loaded, we can export the loaded PyTorch model to ONNX." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model exported at ./onnx/bert-base-cased-squad_opset11.onnx\n" ] } ], "source": [ "output_dir = \"./onnx\"\n", "if not os.path.exists(output_dir):\n", " os.makedirs(output_dir) \n", "export_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opset{}.onnx'.format(opset_version))\n", "\n", "import torch\n", "use_gpu = torch.cuda.is_available()\n", "device = torch.device(\"cuda\" if use_gpu else \"cpu\")\n", "\n", "# Get the first example data to run the model and export it to ONNX\n", "data = dataset[0]\n", "inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", "}\n", "\n", "# Set model to inference mode, which is required before exporting the model because some operators behave differently in \n", "# inference and training mode.\n", "model.eval()\n", "model.to(device)\n", "\n", "if enable_overwrite or not os.path.exists(export_model_path):\n", " with torch.no_grad():\n", " symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}\n", " torch.onnx.export(model, # model being run\n", " args=tuple(inputs.values()), # model input (or a tuple for multiple inputs)\n", " f=export_model_path, # where to save the model (can be a file or file-like object)\n", " opset_version=opset_version, # the ONNX version to export the model to\n", " do_constant_folding=True, # whether to execute constant folding for optimization\n", " input_names=['input_ids', # the model's input names\n", " 'input_mask', \n", " 'segment_ids'],\n", " output_names=['start', 'end'], # the model's output names\n", " dynamic_axes={'input_ids': symbolic_names, # variable length axes\n", " 'input_mask' : symbolic_names,\n", " 'segment_ids' : symbolic_names,\n", " 'start' : symbolic_names,\n", " 'end' : symbolic_names})\n", " print(\"Model exported at \", export_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. PyTorch Inference ##\n", "Use PyTorch to evaluate an example input for comparison purpose." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PyTorch cuda Inference time = 16.57 ms\n" ] } ], "source": [ "import time\n", "\n", "# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.\n", "latency = []\n", "with torch.no_grad():\n", " for i in range(total_samples):\n", " data = dataset[i]\n", " inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", " }\n", " start = time.time()\n", " outputs = model(**inputs)\n", " latency.append(time.time() - start)\n", "print(\"PyTorch {} Inference time = {} ms\".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", "### CUDA and cuDNN Path\n", "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n", "\n", "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "\n", "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", "add_cuda_path = False\n", "\n", "if add_cuda_path:\n", " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n", " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n", " else:\n", " if cuda_dir == cudnn_dir:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", " else:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OpenMP Environment Variable\n", "\n", "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n", "\n", "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", "\n", "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Optional. You can change them according to Performance Test Tool result.\n", "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n", "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to inference the model with ONNX Runtime." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OnnxRuntime gpu Inference time = 4.43 ms\n" ] } ], "source": [ "import psutil\n", "import onnxruntime\n", "import numpy\n", "\n", "assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()\n", "device_name = 'gpu'\n", "\n", "sess_options = onnxruntime.SessionOptions()\n", "\n", "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n", "# Note that this will increase session creation time so enable it for debugging only.\n", "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_{}.onnx\".format(device_name))\n", "\n", "# Please change the value according to best setting in Performance Test Tool result.\n", "sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n", "\n", "session = onnxruntime.InferenceSession(export_model_path, sess_options)\n", "\n", "latency = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n", " ort_inputs = {\n", " 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n", " 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n", " 'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n", " }\n", " start = time.time()\n", " ort_outputs = session.run(None, ort_inputs)\n", " latency.append(time.time() - start)\n", " \n", "print(\"OnnxRuntime {} Inference time = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can compare the output of PyTorch and ONNX Runtime. We can see some results are not close. It is because ONNX Runtime uses some approximation in CUDA optimization. Based on our evaluation on SQuAD data set, F1 score is on par for models before and after optimization." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Verifying correctness *****\n", "PyTorch and ONNX Runtime output 0 are close: True\n", "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n", "PyTorch and ONNX Runtime output 1 are close: True\n", "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n" ] } ], "source": [ "print(\"***** Verifying correctness *****\")\n", "for i in range(2): \n", " print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-02, atol=1e-02))\n", " diff = ort_outputs[i] - outputs[i].cpu().numpy()\n", " max_diff = numpy.max(numpy.abs(diff))\n", " avg_diff = numpy.average(numpy.abs(diff))\n", " print(f'maximum_diff={max_diff} average_diff={avg_diff}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference with Actual Sequence Length\n", "Note that ONNX model is exported using dynamic length axis. It is recommended to use actual sequence input without padding instead of fixed length input for best performance. Let's see how it can be applied to this model.\n", "\n", "From an example input below, we can see zero padding at the end of each sequence." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n", " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n", " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n", " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n", " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n", " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n", " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]],\n", " device='cuda:0'),\n", " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n", " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# An example input (we can see padding). From attention_mask, we can deduce the actual length.\n", "inputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original sequence length is 128. After removing paddings, the sequence length is reduced. Input with smaller sequence length need less computation, thus we can see there is improvement on inference latency. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average length 101\n", "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n" ] } ], "source": [ "import statistics\n", "\n", "latency = []\n", "lengths = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # Instead of using fixed length (128), we can use actual sequence length (less than 128), which helps to get better performance.\n", " actual_sequence_length = sum(data[1].numpy())\n", " lengths.append(actual_sequence_length)\n", " opt_inputs = {\n", " 'input_ids': data[0].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'input_mask': data[1].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'segment_ids': data[2].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length)\n", " }\n", " start = time.time()\n", " opt_outputs = session.run(None, opt_inputs)\n", " latency.append(time.time() - start)\n", "print(\"Average length\", statistics.mean(lengths))\n", "print(\"OnnxRuntime {} Inference time with actual sequence length = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's compare the output and see whether the results are close.\n", "\n", "**Note**: Need end-to-end evaluation on performance and accuracy if you use this strategy." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Comparing results with/without paddings *****\n", "Output 0 are close: True\n", "Output 1 are close: True\n" ] } ], "source": [ "print(\"***** Comparing results with/without paddings *****\")\n", "for i in range(2):\n", " print('Output {} are close:'.format(i), numpy.allclose(opt_outputs[i], ort_outputs[i][:,:len(opt_outputs[i][0])], rtol=1e-03, atol=1e-03))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Offline Optimization and Test Tools\n", "\n", "It is recommended to try [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It could help verify whether the model can be fully optimized, and get performance test results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Transformer Optimizer\n", "\n", "Although OnnxRuntime could optimize Bert model exported by PyTorch. Sometime, model cannot be fully optimized due to different reasons:\n", "* A new subgraph pattern is generated by new version of export tool, and the pattern is not covered by older version of OnnxRuntime. \n", "* The exported model uses dynamic axis and this makes it harder for shape inference of the graph. That blocks some optimization to be applied.\n", "* Some optimization is better to be done offline. Like change input tensor type from int64 to int32 to avoid extra Cast nodes, or convert model to float16 to achieve better performance in V100 or T4 GPU.\n", "\n", "We have python script **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n", "\n", "In this example, we can see that it introduces optimization that is not provided by onnxruntime: SkipLayerNormalization and bias fusion, which is not fused in OnnxRuntime due to shape inference as mentioned.\n", "\n", "It will also tell whether the model is fully optimized or not. If not, that means you might need change the script to fuse some new pattern of subgraph.\n", "\n", "Example Usage:\n", "```\n", "from onnxruntime_tools import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", "You can also use optimizer_cli like the following:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Float32 Model\n", "Let us optimize the ONNX model using the script. The first example will output model with float32 to store weights. This is the choice for most GPUs without Tensor Core.\n", "\n", "If your GPU (like V100 or T4) has Tensor Core, jump to [Float16 Model](#6.-Model-Optimization-with-Float16) section since that will give you better performance than Float32 model." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n", "\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Optimized Graph\n", "We can open the optimized model using [Netron](https://github.com/lutzroeder/netron) to visualize.\n", "\n", "The graph is like the following:\n", "\n", "\n", "Sometime, optimized graph is slightly different. For example, FastGelu is replaced by BiasGelu for CPU inference; When the option --input_int32 is used, Cast nodes for inputs are removed." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import netron\n", "\n", "# change it to True if want to view the optimized model in browser\n", "enable_netron = False\n", "if enable_netron:\n", " # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n", " netron.start(optimized_fp32_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance Test Tool\n", "\n", "The following will create 1000 random inputs of batch_size 1 and sequence length 128, then measure the average latency and throughput numbers.\n", "\n", "Note that the test uses fixed sequence length. If you use [dynamic sequence length](#Inference-with-Actual-Sequence-Length), actual performance depends on the distribution of sequence length.\n", "\n", "**Attention**: Latency numbers from Jupyter Notebook are not accurate. See [Attional Info](#7.-Additional-Info) for more info." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.92 ms, Throughput = 203.24 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.90 ms, Throughput = 203.88 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 5.07 ms, Throughput = 197.16 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.82 ms, Throughput = 207.33 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.93 ms, Throughput = 202.92 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.91 ms, Throughput = 203.55 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.88 ms, Throughput = 204.90 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's load the summary file and take a look. Note that blank value in OMP_NUM_THREADS or OMP_WAIT_POLICY means the environment variable does not exist." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
04.824.534.575.157.258.75207.33112ACTIVENoneTrue
14.884.544.586.477.138.68204.901212PASSIVENoneTrue
24.904.544.576.167.648.82203.88112PASSIVENoneTrue
34.914.554.596.707.438.78203.551212ACTIVENoneTrue
44.924.574.606.507.828.90203.240NoneTrue
54.934.554.596.667.578.80202.92121PASSIVENoneTrue
65.074.564.617.198.119.01197.16121ACTIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 4.82 4.53 4.57 5.15 7.25 \n", "1 4.88 4.54 4.58 6.47 7.13 \n", "2 4.90 4.54 4.57 6.16 7.64 \n", "3 4.91 4.55 4.59 6.70 7.43 \n", "4 4.92 4.57 4.60 6.50 7.82 \n", "5 4.93 4.55 4.59 6.66 7.57 \n", "6 5.07 4.56 4.61 7.19 8.11 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 8.75 207.33 1 12 \n", "1 8.68 204.90 12 12 \n", "2 8.82 203.88 1 12 \n", "3 8.78 203.55 12 12 \n", "4 8.90 203.24 0 \n", "5 8.80 202.92 12 1 \n", "6 9.01 197.16 12 1 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 PASSIVE None True \n", "2 PASSIVE None True \n", "3 ACTIVE None True \n", "4 None True \n", "5 PASSIVE None True \n", "6 ACTIVE None True " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", "\n", "### Model Results Comparison Tool\n", "\n", "When a BERT model is optimized, some approximation is used in calculation. If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare the inference outputs of the original and optimized models. If outputs are all close, it is safe to use the optimized model.\n", "\n", "For GPU inference, the absolute or relative difference is larger than those numbers of CPU inference. Note that slight difference in output will not impact final result. We did end-to-end evaluation using SQuAD data set using a fine-tuned squad model, and F1 score is almost the same before/after optimization." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", "maximum absolute difference=1.9222497940063477e-06\r\n", "maximum relative difference=0.05027933046221733\r\n" ] } ], "source": [ "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Model Optimization with Float16\n", "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.90 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.12 ms, Throughput = 320.00 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.02 ms, Throughput = 331.39 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 332.53 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 328.67 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.72 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 329.32 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
03.012.792.812.865.087.16332.53112ACTIVENoneTrue
13.012.802.812.884.527.05331.900NoneTrue
23.012.782.802.925.017.02331.721212ACTIVENoneTrue
33.022.792.802.856.347.04331.39121ACTIVENoneTrue
43.042.802.822.935.567.08329.321212PASSIVENoneTrue
53.042.792.812.926.377.08328.67121PASSIVENoneTrue
63.122.792.822.966.667.20320.00112PASSIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.01 2.79 2.81 2.86 5.08 \n", "1 3.01 2.80 2.81 2.88 4.52 \n", "2 3.01 2.78 2.80 2.92 5.01 \n", "3 3.02 2.79 2.80 2.85 6.34 \n", "4 3.04 2.80 2.82 2.93 5.56 \n", "5 3.04 2.79 2.81 2.92 6.37 \n", "6 3.12 2.79 2.82 2.96 6.66 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 7.16 332.53 1 12 \n", "1 7.05 331.90 0 \n", "2 7.02 331.72 12 12 \n", "3 7.04 331.39 12 1 \n", "4 7.08 329.32 12 12 \n", "5 7.08 328.67 12 1 \n", "6 7.20 320.00 1 12 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 None True \n", "2 ACTIVE None True \n", "3 ACTIVE None True \n", "4 PASSIVE None True \n", "5 PASSIVE None True \n", "6 PASSIVE None True " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Throughput Tuning\n", "\n", "Some application need best throughput under some constraint on latency. This can be done by testing performance of different batch sizes. The tool could help on this.\n", "\n", "Here is an example that check the performance of multiple batch sizes (1, 2, 4, 8, 16, 32 and 64) using default settings." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n", "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.00 ms, Throughput = 333.83 QPS\n", "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.59 ms, Throughput = 557.32 QPS\n", "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=64 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n", "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=4 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.32 ms, Throughput = 926.92 QPS\n", "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n", "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)batch_size
03.002.792.812.864.377.08333.831
13.593.333.353.426.607.54557.322
24.323.984.014.647.238.11926.924
36.325.945.977.618.9610.121266.638
49.609.229.2511.3212.3313.341666.0516
516.1715.8015.9017.3818.8019.931979.4132
629.2628.8929.0130.6332.5333.282187.1564
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.00 2.79 2.81 2.86 4.37 \n", "1 3.59 3.33 3.35 3.42 6.60 \n", "2 4.32 3.98 4.01 4.64 7.23 \n", "3 6.32 5.94 5.97 7.61 8.96 \n", "4 9.60 9.22 9.25 11.32 12.33 \n", "5 16.17 15.80 15.90 17.38 18.80 \n", "6 29.26 28.89 29.01 30.63 32.53 \n", "\n", " Latency_P99 Throughput(QPS) batch_size \n", "0 7.08 333.83 1 \n", "1 7.54 557.32 2 \n", "2 8.11 926.92 4 \n", "3 10.12 1266.63 8 \n", "4 13.34 1666.05 16 \n", "5 19.93 1979.41 32 \n", "6 33.28 2187.15 64 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float16 model summary from\", latest_result_file)\n", "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n", "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Additional Info\n", "\n", "Note that running Jupyter Notebook has significant impact on performance result. You can close Jupyter Notebook and other applications, then run the performance test in a console to get more accurate performance numbers.\n", "\n", "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it measure inference speed of OnnxRuntime.\n", "\n", "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n", "\n", "Here is the machine configuration that generated the above results. You might get slower or faster result according to your hardware." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\r\n", " \"gpu\": {\r\n", " \"driver_version\": \"440.64.00\",\r\n", " \"devices\": [\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 14110883840,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " },\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 16932601856,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " }\r\n", " ]\r\n", " },\r\n", " \"cpu\": {\r\n", " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n", " \"cores\": 12,\r\n", " \"logical_cores\": 12,\r\n", " \"hz\": \"2.5940 GHz\",\r\n", " \"l2_cache\": \"256 KB\",\r\n", " \"l3_cache\": \"35840 KB\",\r\n", " \"processor\": \"x86_64\"\r\n", " },\r\n", " \"memory\": {\r\n", " \"total\": 236645588992,\r\n", " \"available\": 222567559168\r\n", " },\r\n", " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n", " \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n", " \"onnxruntime\": {\r\n", " \"version\": \"1.3.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"pytorch\": {\r\n", " \"version\": \"1.5.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"tensorflow\": null\r\n", "}\r\n" ] } ], "source": [ "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" ] } ], "metadata": { "kernelspec": { "display_name": "PyCharm (ccks_ner-master)", "language": "python", "name": "pycharm-de4c0941" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: code/bert-base-count3-len100/finetuning/Config.py ================================================ from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ RobertaTokenizer, RobertaModel, RobertaConfig from NEZHA.modeling_nezha import NeZhaModel from NEZHA.configuration_nezha import NeZhaConfig MODELS = { 'BertForClass': BertModel, 'BertForClass_MultiDropout': BertModel, 'BertLastTwoCls': BertModel, 'BertLastCls':BertModel, 'BertLastTwoClsPooler': BertModel, 'BertLastTwoEmbeddings': BertModel, 'BertLastTwoEmbeddingsPooler': BertModel, 'BertLastFourCls': BertModel, 'BertLastFourClsPooler': BertModel, 'BertLastFourEmbeddings': BertModel, 'BertLastFourEmbeddingsPooler': BertModel, 'BertDynCls': BertModel, 'BertDynEmbeddings': BertModel, 'BertRNN': BertModel, 'BertCNN': XLNetModel, 'BertRCNN': BertModel, 'XLNet': XLNetModel, 'Electra': ElectraModel, 'NEZHA': NeZhaModel } TOKENIZERS = { 'BertForClass': BertTokenizer, 'BertForClass_MultiDropout': BertTokenizer, 'BertLastTwoCls': BertTokenizer, 'BertLastCls': BertTokenizer, 'BertLastTwoClsPooler': BertTokenizer, 'BertLastTwoEmbeddings': BertTokenizer, 'BertLastTwoEmbeddingsPooler': BertTokenizer, 'BertLastFourCls': BertTokenizer, 'BertLastFourClsPooler': BertTokenizer, 'BertLastFourEmbeddings': BertTokenizer, 'BertLastFourEmbeddingsPooler': BertTokenizer, 'BertDynCls': BertTokenizer, 'BertDynEmbeddings': BertTokenizer, 'BertRNN': BertTokenizer, 'BertCNN': BertTokenizer, 'BertRCNN': BertTokenizer, 'XLNet': XLNetTokenizer, 'Electra': ElectraTokenizer, 'NEZHA': BertTokenizer } CONFIGS = { 'BertForClass': BertConfig, 'BertForClass_MultiDropout': BertConfig, 'BertLastTwoCls': BertConfig, 'BertLastCls': BertConfig, 'BertLastTwoClsPooler': BertConfig, 'BertLastTwoEmbeddings': BertConfig, 'BertLastTwoEmbeddingsPooler': BertConfig, 'BertLastFourCls': BertConfig, 'BertLastFourClsPooler': BertConfig, 'BertLastFourEmbeddings': BertConfig, 'BertLastFourEmbeddingsPooler': BertConfig, 'BertDynCls': BertConfig, 'BertDynEmbeddings': BertConfig, 'BertRNN': BertConfig, 'BertCNN': BertConfig, 'BertRCNN': BertConfig, 'XLNet': XLNetConfig, 'Electra': ElectraConfig, 'NEZHA': NeZhaConfig } ================================================ FILE: code/bert-base-count3-len100/finetuning/NEZHA/configuration_nezha.py ================================================ from transformers import PretrainedConfig NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class NeZhaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "nezha" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, max_relative_position=64, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, use_relative_position=True, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.max_relative_position = max_relative_position self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_relative_position=use_relative_position self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count3-len100/finetuning/NEZHA/modeling_nezha.py ================================================ import math import os import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_nezha import NeZhaConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_utils import PreTrainedModel, prune_linear_layer from transformers.models.bert.modeling_bert import ( BertOutput, BertPooler, BertSelfOutput, BertIntermediate, BertOnlyMLMHead, BertOnlyNSPHead, BertPreTrainingHeads, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING, ) logger = logging.getLogger(__name__) _CONFIG_FOR_DOC = "NeZhaConfig" _TOKENIZER_FOR_DOC = "NeZhaTokenizer" NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: # logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step", "good_steps", "loss_scale", 'bad_steps'] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class NeZhaEmbeddings(nn.Module): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.use_relative_position = config.use_relative_position self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] device = input_ids.device if input_ids is not None else inputs_embeds.device if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings def relative_position_encoding(depth, max_length=512, max_relative_position=127): vocab_size = max_relative_position * 2 + 1 range_vec = torch.arange(max_length) range_mat = range_vec.repeat(max_length).view(max_length, max_length) distance_mat = range_mat - torch.t(range_mat) distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) final_mat = distance_mat_clipped + max_relative_position embeddings_table = torch.zeros(vocab_size, depth) position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) embeddings_table[:, 0::2] = torch.sin(position * div_term) embeddings_table[:, 1::2] = torch.cos(position * div_term) embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) flat_relative_positions_matrix = final_mat.view(-1) one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, num_classes=vocab_size).float() positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) return positions_encoding class NeZhaSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, depth=self.attention_head_size, max_relative_position=config.max_relative_position).to('cuda') def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] query_layer_t = query_layer.permute(2, 0, 1, 3) query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, self.attention_head_size) key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, num_attention_heads, from_seq_length) key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] attention_probs_t = attention_probs.permute(2, 0, 1, 3) attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, to_seq_length) value_position_scores = torch.matmul(attentions_probs_r, relations_values) value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, num_attention_heads, self.attention_head_size) value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) context_layer = context_layer + value_position_scores_r_t context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class NeZhaAttention(nn.Module): def __init__(self, config): super().__init__() self.self = NeZhaSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class NeZhaLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = NeZhaAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = NeZhaAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class NeZhaEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class NeZhaPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = NeZhaConfig pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class NeZhaModel(NeZhaPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = NeZhaEmbeddings(config) self.encoder = NeZhaEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( attention_mask, input_shape, self.device ) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class NeZhaForPreTraining(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) # add hidden states and attention if they are here outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class NeZhaForMaskedLM(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. masked_lm_labels = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class NeZhaForSequenceClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class NeZhaForMultipleChoice(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class NeZhaForTokenClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, position_ids=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count3-len100/finetuning/model.py ================================================ import torch import random import os from torch import nn, optim import torch.nn.functional as F from transformers.activations import get_activation from Config import * class BertForClass(nn.Module): def __init__(self, config): super(BertForClass, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit class BertForClass_MultiDropout(nn.Module): def __init__(self, config): super(BertForClass_MultiDropout, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.multi_drop = 5 self.multi_dropouts = nn.ModuleList([nn.Dropout(config.dropout) for _ in range(self.multi_drop)]) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) for j, dropout in enumerate(self.multi_dropouts): if j == 0: logit = self.classifier(dropout(concat_out)) / self.multi_drop else: logit += self.classifier(dropout(concat_out)) / self.multi_drop return logit class BertLastTwoCls(nn.Module): def __init__(self, config): super(BertLastTwoCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) logit = self.classifier(pooler_output) return logit class BertLastCls(nn.Module): def __init__(self, config): super(BertLastCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] if self.isDropout: output = self.dropout(pooler_output) logit = self.classifier(output) return logit class BertLastTwoClsPooler(nn.Module): def __init__(self, config): super(BertLastTwoClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddings(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourCls(nn.Module): def __init__(self, config): super(BertLastFourCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] output = torch.cat( (hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourClsPooler(nn.Module): def __init__(self, config): super(BertLastFourClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddings(nn.Module): def __init__(self, config): super(BertLastFourEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastFourEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertDynCls(nn.Module): def __init__(self, config): super(BertDynCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = hidden_states[-(i + 1)][0] weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertDynEmbeddings(nn.Module): def __init__(self, config): super(BertDynEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = torch.mean(hidden_states[-(i + 1)], dim=1) weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertRNN(nn.Module): def __init__(self, config): super(BertRNN, self).__init__() self.rnn_type = "gru" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.1 self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.num_directions = 1 if not self.bidirectional else 2 if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) self.dropout = nn.Dropout(self.drop_out) self.fc_rnn = nn.Linear(self.hidden_dim * self.num_directions, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(sequence_output) else: output, (hidden, cell) = self.rnn(sequence_output) # output = [ batch size, sent len, hidden_dim * bidirectional] batch_size, max_seq_len, hidden_dim = output.shape hidden = torch.transpose(hidden, 1, 0) hidden = torch.mean(torch.reshape(hidden, [batch_size, -1, hidden_dim]), dim=1) output = torch.sum(output, dim=1) fc_input = self.dropout(output + hidden) # output = torch.mean(output, dim=1) # fc_input = self.dropout(output) out = self.fc_rnn(fc_input) return out class BertCNN(nn.Module): def __init__(self, config): super(BertCNN, self).__init__() self.num_filters = 100 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.hidden_size = self.bert_config.to_dict()['hidden_size'] self.filter_sizes = {3, 4, 5} self.drop_out = 0.5 self.convs = nn.ModuleList( [nn.Conv2d(1, self.num_filters, (k, self.hidden_size)) for k in self.filter_sizes]) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dropout = nn.Dropout(self.drop_out) self.fc_cnn = nn.Linear(self.num_filters * len(self.filter_sizes), config.num_class) def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) x = F.max_pool1d(x, x.size(2)).squeeze(2) return x def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = self.dropout(sequence_output) out = sequence_output.unsqueeze(1) out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) out = self.dropout(out) out = self.fc_cnn(out) return out class BertRCNN(nn.Module): def __init__(self, config): super(BertRCNN, self).__init__() self.rnn_type = "lstm" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.5 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) # self.maxpool = nn.MaxPool1d() self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.fc = nn.Linear(self.hidden_dim * self.n_layers, config.num_class) self.dropout = nn.Dropout(self.drop_out) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sentence_len = sequence_output.shape[1] pooler_output = pooler_output.unsqueeze(dim=1).repeat(1, sentence_len, 1) bert_sentence = sequence_output + pooler_output self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(bert_sentence) else: output, (hidden, cell) = self.rnn(bert_sentence) batch_size, max_seq_len, hidden_dim = output.shape out = torch.transpose(output.relu(), 1, 2) out = F.max_pool1d(out, max_seq_len).squeeze() out = self.fc(out) return out class XLNet(nn.Module): def __init__(self, config): super(XLNet, self).__init__() self.xlnet = XLNetModel.from_pretrained(config.model_path) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.fc = nn.Linear(self.xlnet.d_model, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output = self.xlnet(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = torch.sum(sequence_output[0], dim=1) if self.isDropout: sequence_output = self.dropout(sequence_output) out = self.fc(sequence_output) return out class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x class Electra(nn.Module): def __init__(self, config): super(Electra, self).__init__() self.electra = ElectraModel.from_pretrained(config.model_path) config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.electra_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) self.electra_config.num_labels = config.num_class self.fc = ElectraClassificationHead(self.electra_config) def forward(self, input_ids, input_masks, segment_ids): discriminator_hidden_states = self.electra(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = discriminator_hidden_states[0] out = self.fc(sequence_output) return out class NEZHA(nn.Module): def __init__(self, config): super(NEZHA, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) #self.bert_model = MODELS[config.model](config=self.bert_config) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) # NEZHA init #torch_init_model(self.bert_model, os.path.join(config.model_path, 'pytorch_model.bin')) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit ================================================ FILE: code/bert-base-count3-len100/finetuning/models/gitkeep ================================================ ================================================ FILE: code/bert-base-count3-len100/finetuning/multi_gpu_QA.py ================================================ from tqdm import tqdm, trange import numpy as np import pandas as pd import logging import torch import random import os from torch import nn, optim from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig from transformers.optimization import get_linear_schedule_with_warmup from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score from model import * from utils import * import time import logging logging.basicConfig(level=logging.DEBUG, filename="train.log",filemode='a') from NEZHA.modeling_nezha import * MODEL_CLASSES = { 'BertForClass': BertForClass, 'BertLastCls': BertLastCls, 'BertLastTwoCls': BertLastTwoCls, 'BertLastTwoClsPooler': BertLastTwoClsPooler, 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 'BertLastFourCls': BertLastFourCls, 'BertLastFourClsPooler': BertLastFourClsPooler, 'BertLastFourEmbeddings': BertLastFourEmbeddings, 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 'BertDynCls': BertDynCls, 'BertDynEmbeddings': BertDynEmbeddings, 'BertRNN': BertRNN, 'BertCNN': BertCNN, 'BertRCNN': BertRCNN, 'XLNet': XLNet, 'Electra': Electra, 'NEZHA': NEZHA, } class Config: def __init__(self): # 预训练模型路径 self.modelId = 2 self.model = "BertLastCls" self.Stratification = False self.model_path = '../../bert-base-count3/pretrain/bert_model/' self.num_class = 2 self.dropout = 0.2 self.MAX_LEN = 100 self.epoch = 3 self.learn_rate = 4e-5 self.normal_lr = 1e-4 self.batch_size = 32 self.k_fold = 10 self.seed = 42 self.device = torch.device('cuda') # self.device = torch.device('cpu') self.focalloss = False self.pgd = False self.fgm = True config = Config() os.environ['PYTHONHASHSEED']='0'#消除hash算法的随机性 random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) file_path = './log/' # 创建一个logger logger = logging.getLogger('mylogger') logger.setLevel(logging.DEBUG) train = pd.read_csv('/tcdata/gaiic_track3_round1_train_20210228.tsv',sep='\t',header=None) semi = pd.read_csv('/tcdata/gaiic_track3_round2_train_20210407.tsv',sep='\t',header=None) train = pd.concat([train, semi], sort=False) train.columns=['q1','q2','label'] train_query1 = train['q1'].values.astype(str) train_query2 = train['q2'].values.astype(str) train_label = train['label'].values.astype(int) oof_train = np.zeros((len(train), config.num_class), dtype=np.float32) #kf = StratifiedKFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) kf = KFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) for fold, (train_index, valid_index) in enumerate(kf.split(train_query1, train_label)): print('\n\n------------fold:{}------------\n'.format(fold)) ''' q1 = train_query1[train_index] q2 = train_query2[train_index] y = train_label[train_index] ''' q1 = train_query1 q2 = train_query2 y = train_label val_q1 = train_query1[valid_index] val_q2 = train_query2[valid_index] val_y = train_label[valid_index] train_D = data_generator([q1, q2, y], config, shuffle=True) val_D = data_generator([val_q1, val_q2, val_y], config) model = MODEL_CLASSES[config.model](config).to(config.device) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) if config.pgd: pgd = PGD(model) K = 3 elif config.fgm: fgm = FGM(model) if config.focalloss: loss_fn = FocalLoss(config.num_class) else: loss_fn = nn.CrossEntropyLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 num_train_steps = int(len(train) / config.batch_size * config.epoch) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if config.Stratification: bert_params = [x for x in param_optimizer if 'bert' in x[0]] normal_params = [p for n, p in param_optimizer if 'bert' not in n] optimizer_parameters = [ {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, {'params': normal_params, 'lr': config.normal_lr}, ] else: optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) # lr为全局学习率 scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(len(train) / config.batch_size / 2), num_training_steps=num_train_steps ) best_auc = 0 PATH = './models/bert_{}.pth'.format(fold) save_model_path = './models/' if not os.path.exists(save_model_path): os.makedirs(save_model_path) for e in range(config.epoch): print('\n------------epoch:{}------------'.format(e)) model.train() acc = 0 train_len = 0 loss_num = 0 tq = tqdm(train_D,ncols=70,disable=True) last=time.time() for input_ids, input_masks, segment_ids, labels in tq: label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) loss = loss_fn(y_pred, label_t) loss = loss.mean() loss.backward() if config.pgd: pgd.backup_grad() # 对抗训练 for t in range(K): pgd.attack(is_first_attack=(t == 0)) # 在embedding上添加对抗扰动, first attack时备份param.data if t != K - 1: model.zero_grad() else: pgd.restore_grad() y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 pgd.restore() # 恢复embedding参数 elif config.fgm: # 对抗训练 fgm.attack() # 在embedding上添加对抗扰动 y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 fgm.restore() # 恢复embedding参数 # 梯度下降,更新参数 optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) acc += sum(y_pred == labels) loss_num += loss.item() train_len += len(labels) tq.set_postfix(fold=fold, epoch=e, loss=loss_num / train_len, acc=acc / train_len) print(f"微调第{e}轮耗时:{time.time()-last}") model.eval() with torch.no_grad(): y_p = [] y_l = [] train_logit = None for input_ids, input_masks, segment_ids, labels in tqdm(val_D,disable=True): label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) y_pred = F.softmax(y_pred) y_pred = y_pred.detach().to("cpu").numpy() if train_logit is None: train_logit = y_pred else: train_logit = np.vstack((train_logit, y_pred)) y_p += list(y_pred[:,1]) y_pred = np.argmax(y_pred, axis=1) y_l += list(y_pred) f1 = f1_score(val_y, y_l, average="macro") auc_score = roc_auc_score(val_y, y_p) print("best_auc:{} auc_score:{} f1:{}\n".format(best_auc, auc_score, f1)) if auc_score >= best_auc: best_auc = auc_score oof_train[valid_index] = np.array(train_logit) #torch.save(model.module.state_dict() if hasattr(model, "module") else model.state_dict(), PATH) torch.save(model.module if hasattr(model, "module") else model, PATH) optimizer.zero_grad() del model torch.cuda.empty_cache() break ================================================ FILE: code/bert-base-count3-len100/finetuning/utils.py ================================================ import torch from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig import numpy as np import os import random from Config import * import torch import torch.nn as nn import torch.nn.functional as F def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def fastTokenizer(a:str,b:str,maxLen,tk): a,b=a.split(),b.split() a,b=tk.convert_tokens_to_ids(a),tk.convert_tokens_to_ids(b) maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] input_ids=[tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] token_type_ids=[0]*(len(a)+2)+[1]*(len(b)+1) return {'input_ids': input_ids, 'token_type_ids': token_type_ids} class data_generator: def __init__(self, data, config, shuffle=False): self.data = data self.batch_size = config.batch_size self.max_length = config.MAX_LEN self.shuffle = shuffle vocab = 'vocab.txt' if os.path.exists(config.model_path + 'vocab.txt') else 'spiece.model' self.tokenizer = TOKENIZERS[config.model].from_pretrained(config.model_path + vocab) self.steps = len(self.data[0]) // self.batch_size if len(self.data[0]) % self.batch_size != 0: self.steps += 1 def __len__(self): return self.steps def __iter__(self): q1, q2, y = self.data idxs = list(range(len(self.data[0]))) if self.shuffle: np.random.shuffle(idxs) input_ids, input_masks, segment_ids, labels = [], [], [], [] for index, i in enumerate(idxs): text = q1[i] text_pair = q2[i] ''' # text = self.tokenizer(text, text_pair, padding='max_length', truncation=True, max_length=self.max_length) text = fastTokenizer(text, text_pair, self.max_length, self.tokenizer) input_ids.append(text['input_ids']) segment_ids.append(text['token_type_ids']) input_masks.append([1] * len(text['input_ids'])) # bs为1时无padding,全1 yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] ''' tkRes = self.tokenizer(text, text_pair, max_length=self.max_length, truncation='longest_first', return_attention_mask=False) input_id = tkRes['input_ids'] segment_id = tkRes['token_type_ids'] assert len(segment_id) == len(input_id) input_ids.append(input_id) segment_ids.append(segment_id) labels.append(y[i]) if len(input_ids) == self.batch_size or i == idxs[-1]: input_ids = paddingList(input_ids, 0, returnTensor=True) # 动态padding segment_ids = paddingList(segment_ids, 0, returnTensor=True) input_masks = (input_ids != 0) yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] class PGD(): def __init__(self, model): self.model = model self.emb_backup = {} self.grad_backup = {} def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: if is_first_attack: self.emb_backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0 and not torch.isnan(norm): r_at = alpha * param.grad / norm param.data.add_(r_at) param.data = self.project(name, param.data, epsilon) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.emb_backup param.data = self.emb_backup[name] self.emb_backup = {} def project(self, param_name, param_data, epsilon): r = param_data - self.emb_backup[param_name] if torch.norm(r) > epsilon: r = epsilon * r / torch.norm(r) return self.emb_backup[param_name] + r def backup_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: self.grad_backup[name] = param.grad.clone() def restore_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: param.grad = self.grad_backup[name] class FGM(): def __init__(self, model): self.model = model self.backup = {} def attack(self, epsilon=0.25, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: self.backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0: r_at = epsilon * param.grad / norm param.data.add_(r_at) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.backup param.data = self.backup[name] self.backup = {} # 支持多分类和二分类 class FocalLoss(nn.Module): """ This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) :param num_class: :param alpha: (tensor) 3D or 4D the scalar factor for this criterion :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more focus on hard misclassified example :param smooth: (float,double) smooth value when cross entropy :param balance_index: (int) balance class index, should be specific when alpha is float :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. """ def __init__(self, num_class, alpha=None, gamma=2, smooth=None, size_average=True): super(FocalLoss, self).__init__() self.num_class = num_class self.alpha = alpha self.gamma = gamma self.smooth = smooth self.size_average = size_average if self.alpha is None: self.alpha = torch.ones(self.num_class, 1) elif isinstance(self.alpha, (list, np.ndarray)): assert len(self.alpha) == self.num_class self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) self.alpha = self.alpha / self.alpha.sum() else: raise TypeError('Not support alpha type') if self.smooth is not None: if self.smooth < 0 or self.smooth > 1.0: raise ValueError('smooth value should be in [0,1]') def forward(self, input, target): logit = F.softmax(input, dim=1) if logit.dim() > 2: # N,C,d1,d2 -> N,C,m (m=d1*d2*...) logit = logit.view(logit.size(0), logit.size(1), -1) logit = logit.permute(0, 2, 1).contiguous() logit = logit.view(-1, logit.size(-1)) target = target.view(-1, 1) # N = input.size(0) # alpha = torch.ones(N, self.num_class) # alpha = alpha * (1 - self.alpha) # alpha = alpha.scatter_(1, target.long(), self.alpha) epsilon = 1e-10 alpha = self.alpha if alpha.device != input.device: alpha = alpha.to(input.device) idx = target.cpu().long() one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() one_hot_key = one_hot_key.scatter_(1, idx, 1) if one_hot_key.device != logit.device: one_hot_key = one_hot_key.to(logit.device) if self.smooth: one_hot_key = torch.clamp( one_hot_key, self.smooth, 1.0 - self.smooth) pt = (one_hot_key * logit).sum(1) + epsilon logpt = pt.log() gamma = self.gamma alpha = alpha[idx] loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt if self.size_average: loss = loss.mean() else: loss = loss.sum() return loss def f1_match(y_true,y_pred): acc = sum(y_pred & y_true) / (sum(y_pred)) rec = sum(y_pred & y_true) / (sum(y_true)) return 2 * acc * rec /(acc + rec) ================================================ FILE: code/bert-base-count5/finetuning/.ipynb_checkpoints/PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved. \n", "Licensed under the MIT License." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference PyTorch Bert Model with ONNX Runtime on GPU" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, you'll learn how to load a Bert model from PyTorch, convert it to ONNX, and inference it for high performance using ONNX Runtime and NVIDIA GPU. In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text from the corresponding reading passage, or the question might be unanswerable.\n", "\n", "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Prerequisites ##\n", "It requires your machine to have a GPU, and a python environment with [PyTorch](https://pytorch.org/) installed before running this notebook.\n", "\n", "#### GPU Environment Setup using AnaConda\n", "\n", "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", "\n", "```console\n", "conda create -n gpu_env python=3.7\n", "conda activate gpu_env\n", "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n", "conda install -c anaconda ipykernel\n", "conda install -c conda-forge ipywidgets\n", "python -m ipykernel install --user --name=gpu_env_py37\n", "jupyter notebook\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n", "\n", "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Skipping onnxruntime-gpu as it is not installed.\u001b[0m\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet --upgrade transformers\n", "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n", "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", "!{sys.executable} -m pip install --quiet wget netron pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Load Pretrained Bert model ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We begin by downloading the SQuAD data file and store them in the specified location. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "cache_dir = \"./squad\"\n", "if not os.path.exists(cache_dir):\n", " os.makedirs(cache_dir)\n", "\n", "predict_file_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json\"\n", "predict_file = os.path.join(cache_dir, \"dev-v1.1.json\")\n", "if not os.path.exists(predict_file):\n", " import wget\n", " print(\"Start downloading predict file.\")\n", " wget.download(predict_file_url, predict_file)\n", " print(\"Predict file downloaded.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first define some constant variables." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Whether allow overwriting existing ONNX model and download the latest script from GitHub\n", "enable_overwrite = True\n", "\n", "# Total samples to inference, so that we can get average latency\n", "total_samples = 1000\n", "\n", "# ONNX opset version\n", "opset_version=11" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Specify some model configuration variables." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n", "model_name_or_path = \"bert-base-cased\"\n", "max_seq_length = 128\n", "doc_stride = 128\n", "max_query_length = 64" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Start to load model from pretrained. This step could take a few minutes. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n", "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n", "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n" ] } ], "source": [ "# The following code is adapted from HuggingFace transformers\n", "# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n", "\n", "from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "\n", "# Load pretrained model and tokenizer\n", "config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n", "tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", "model = model_class.from_pretrained(model_name_or_path,\n", " from_tf=False,\n", " config=config,\n", " cache_dir=cache_dir)\n", "# load some examples\n", "from transformers.data.processors.squad import SquadV1Processor\n", "\n", "processor = SquadV1Processor()\n", "examples = processor.get_dev_examples(None, filename=predict_file)\n", "\n", "from transformers import squad_convert_examples_to_features\n", "features, dataset = squad_convert_examples_to_features( \n", " examples=examples[:total_samples], # convert enough examples for this notebook\n", " tokenizer=tokenizer,\n", " max_seq_length=max_seq_length,\n", " doc_stride=doc_stride,\n", " max_query_length=max_query_length,\n", " is_training=False,\n", " return_dataset='pt'\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Export the loaded model ##\n", "Once the model is loaded, we can export the loaded PyTorch model to ONNX." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model exported at ./onnx/bert-base-cased-squad_opset11.onnx\n" ] } ], "source": [ "output_dir = \"./onnx\"\n", "if not os.path.exists(output_dir):\n", " os.makedirs(output_dir) \n", "export_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opset{}.onnx'.format(opset_version))\n", "\n", "import torch\n", "use_gpu = torch.cuda.is_available()\n", "device = torch.device(\"cuda\" if use_gpu else \"cpu\")\n", "\n", "# Get the first example data to run the model and export it to ONNX\n", "data = dataset[0]\n", "inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", "}\n", "\n", "# Set model to inference mode, which is required before exporting the model because some operators behave differently in \n", "# inference and training mode.\n", "model.eval()\n", "model.to(device)\n", "\n", "if enable_overwrite or not os.path.exists(export_model_path):\n", " with torch.no_grad():\n", " symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}\n", " torch.onnx.export(model, # model being run\n", " args=tuple(inputs.values()), # model input (or a tuple for multiple inputs)\n", " f=export_model_path, # where to save the model (can be a file or file-like object)\n", " opset_version=opset_version, # the ONNX version to export the model to\n", " do_constant_folding=True, # whether to execute constant folding for optimization\n", " input_names=['input_ids', # the model's input names\n", " 'input_mask', \n", " 'segment_ids'],\n", " output_names=['start', 'end'], # the model's output names\n", " dynamic_axes={'input_ids': symbolic_names, # variable length axes\n", " 'input_mask' : symbolic_names,\n", " 'segment_ids' : symbolic_names,\n", " 'start' : symbolic_names,\n", " 'end' : symbolic_names})\n", " print(\"Model exported at \", export_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. PyTorch Inference ##\n", "Use PyTorch to evaluate an example input for comparison purpose." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PyTorch cuda Inference time = 16.57 ms\n" ] } ], "source": [ "import time\n", "\n", "# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.\n", "latency = []\n", "with torch.no_grad():\n", " for i in range(total_samples):\n", " data = dataset[i]\n", " inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", " }\n", " start = time.time()\n", " outputs = model(**inputs)\n", " latency.append(time.time() - start)\n", "print(\"PyTorch {} Inference time = {} ms\".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", "### CUDA and cuDNN Path\n", "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n", "\n", "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "\n", "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", "add_cuda_path = False\n", "\n", "if add_cuda_path:\n", " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n", " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n", " else:\n", " if cuda_dir == cudnn_dir:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", " else:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OpenMP Environment Variable\n", "\n", "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n", "\n", "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", "\n", "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Optional. You can change them according to Performance Test Tool result.\n", "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n", "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to inference the model with ONNX Runtime." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OnnxRuntime gpu Inference time = 4.43 ms\n" ] } ], "source": [ "import psutil\n", "import onnxruntime\n", "import numpy\n", "\n", "assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()\n", "device_name = 'gpu'\n", "\n", "sess_options = onnxruntime.SessionOptions()\n", "\n", "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n", "# Note that this will increase session creation time so enable it for debugging only.\n", "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_{}.onnx\".format(device_name))\n", "\n", "# Please change the value according to best setting in Performance Test Tool result.\n", "sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n", "\n", "session = onnxruntime.InferenceSession(export_model_path, sess_options)\n", "\n", "latency = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n", " ort_inputs = {\n", " 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n", " 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n", " 'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n", " }\n", " start = time.time()\n", " ort_outputs = session.run(None, ort_inputs)\n", " latency.append(time.time() - start)\n", " \n", "print(\"OnnxRuntime {} Inference time = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can compare the output of PyTorch and ONNX Runtime. We can see some results are not close. It is because ONNX Runtime uses some approximation in CUDA optimization. Based on our evaluation on SQuAD data set, F1 score is on par for models before and after optimization." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Verifying correctness *****\n", "PyTorch and ONNX Runtime output 0 are close: True\n", "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n", "PyTorch and ONNX Runtime output 1 are close: True\n", "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n" ] } ], "source": [ "print(\"***** Verifying correctness *****\")\n", "for i in range(2): \n", " print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-02, atol=1e-02))\n", " diff = ort_outputs[i] - outputs[i].cpu().numpy()\n", " max_diff = numpy.max(numpy.abs(diff))\n", " avg_diff = numpy.average(numpy.abs(diff))\n", " print(f'maximum_diff={max_diff} average_diff={avg_diff}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference with Actual Sequence Length\n", "Note that ONNX model is exported using dynamic length axis. It is recommended to use actual sequence input without padding instead of fixed length input for best performance. Let's see how it can be applied to this model.\n", "\n", "From an example input below, we can see zero padding at the end of each sequence." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n", " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n", " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n", " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n", " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n", " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n", " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]],\n", " device='cuda:0'),\n", " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n", " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# An example input (we can see padding). From attention_mask, we can deduce the actual length.\n", "inputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original sequence length is 128. After removing paddings, the sequence length is reduced. Input with smaller sequence length need less computation, thus we can see there is improvement on inference latency. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average length 101\n", "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n" ] } ], "source": [ "import statistics\n", "\n", "latency = []\n", "lengths = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # Instead of using fixed length (128), we can use actual sequence length (less than 128), which helps to get better performance.\n", " actual_sequence_length = sum(data[1].numpy())\n", " lengths.append(actual_sequence_length)\n", " opt_inputs = {\n", " 'input_ids': data[0].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'input_mask': data[1].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'segment_ids': data[2].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length)\n", " }\n", " start = time.time()\n", " opt_outputs = session.run(None, opt_inputs)\n", " latency.append(time.time() - start)\n", "print(\"Average length\", statistics.mean(lengths))\n", "print(\"OnnxRuntime {} Inference time with actual sequence length = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's compare the output and see whether the results are close.\n", "\n", "**Note**: Need end-to-end evaluation on performance and accuracy if you use this strategy." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Comparing results with/without paddings *****\n", "Output 0 are close: True\n", "Output 1 are close: True\n" ] } ], "source": [ "print(\"***** Comparing results with/without paddings *****\")\n", "for i in range(2):\n", " print('Output {} are close:'.format(i), numpy.allclose(opt_outputs[i], ort_outputs[i][:,:len(opt_outputs[i][0])], rtol=1e-03, atol=1e-03))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Offline Optimization and Test Tools\n", "\n", "It is recommended to try [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It could help verify whether the model can be fully optimized, and get performance test results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Transformer Optimizer\n", "\n", "Although OnnxRuntime could optimize Bert model exported by PyTorch. Sometime, model cannot be fully optimized due to different reasons:\n", "* A new subgraph pattern is generated by new version of export tool, and the pattern is not covered by older version of OnnxRuntime. \n", "* The exported model uses dynamic axis and this makes it harder for shape inference of the graph. That blocks some optimization to be applied.\n", "* Some optimization is better to be done offline. Like change input tensor type from int64 to int32 to avoid extra Cast nodes, or convert model to float16 to achieve better performance in V100 or T4 GPU.\n", "\n", "We have python script **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n", "\n", "In this example, we can see that it introduces optimization that is not provided by onnxruntime: SkipLayerNormalization and bias fusion, which is not fused in OnnxRuntime due to shape inference as mentioned.\n", "\n", "It will also tell whether the model is fully optimized or not. If not, that means you might need change the script to fuse some new pattern of subgraph.\n", "\n", "Example Usage:\n", "```\n", "from onnxruntime_tools import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", "You can also use optimizer_cli like the following:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Float32 Model\n", "Let us optimize the ONNX model using the script. The first example will output model with float32 to store weights. This is the choice for most GPUs without Tensor Core.\n", "\n", "If your GPU (like V100 or T4) has Tensor Core, jump to [Float16 Model](#6.-Model-Optimization-with-Float16) section since that will give you better performance than Float32 model." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n", "\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Optimized Graph\n", "We can open the optimized model using [Netron](https://github.com/lutzroeder/netron) to visualize.\n", "\n", "The graph is like the following:\n", "\n", "\n", "Sometime, optimized graph is slightly different. For example, FastGelu is replaced by BiasGelu for CPU inference; When the option --input_int32 is used, Cast nodes for inputs are removed." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import netron\n", "\n", "# change it to True if want to view the optimized model in browser\n", "enable_netron = False\n", "if enable_netron:\n", " # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n", " netron.start(optimized_fp32_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance Test Tool\n", "\n", "The following will create 1000 random inputs of batch_size 1 and sequence length 128, then measure the average latency and throughput numbers.\n", "\n", "Note that the test uses fixed sequence length. If you use [dynamic sequence length](#Inference-with-Actual-Sequence-Length), actual performance depends on the distribution of sequence length.\n", "\n", "**Attention**: Latency numbers from Jupyter Notebook are not accurate. See [Attional Info](#7.-Additional-Info) for more info." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.92 ms, Throughput = 203.24 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.90 ms, Throughput = 203.88 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 5.07 ms, Throughput = 197.16 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.82 ms, Throughput = 207.33 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.93 ms, Throughput = 202.92 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.91 ms, Throughput = 203.55 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.88 ms, Throughput = 204.90 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's load the summary file and take a look. Note that blank value in OMP_NUM_THREADS or OMP_WAIT_POLICY means the environment variable does not exist." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
04.824.534.575.157.258.75207.33112ACTIVENoneTrue
14.884.544.586.477.138.68204.901212PASSIVENoneTrue
24.904.544.576.167.648.82203.88112PASSIVENoneTrue
34.914.554.596.707.438.78203.551212ACTIVENoneTrue
44.924.574.606.507.828.90203.240NoneTrue
54.934.554.596.667.578.80202.92121PASSIVENoneTrue
65.074.564.617.198.119.01197.16121ACTIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 4.82 4.53 4.57 5.15 7.25 \n", "1 4.88 4.54 4.58 6.47 7.13 \n", "2 4.90 4.54 4.57 6.16 7.64 \n", "3 4.91 4.55 4.59 6.70 7.43 \n", "4 4.92 4.57 4.60 6.50 7.82 \n", "5 4.93 4.55 4.59 6.66 7.57 \n", "6 5.07 4.56 4.61 7.19 8.11 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 8.75 207.33 1 12 \n", "1 8.68 204.90 12 12 \n", "2 8.82 203.88 1 12 \n", "3 8.78 203.55 12 12 \n", "4 8.90 203.24 0 \n", "5 8.80 202.92 12 1 \n", "6 9.01 197.16 12 1 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 PASSIVE None True \n", "2 PASSIVE None True \n", "3 ACTIVE None True \n", "4 None True \n", "5 PASSIVE None True \n", "6 ACTIVE None True " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", "\n", "### Model Results Comparison Tool\n", "\n", "When a BERT model is optimized, some approximation is used in calculation. If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare the inference outputs of the original and optimized models. If outputs are all close, it is safe to use the optimized model.\n", "\n", "For GPU inference, the absolute or relative difference is larger than those numbers of CPU inference. Note that slight difference in output will not impact final result. We did end-to-end evaluation using SQuAD data set using a fine-tuned squad model, and F1 score is almost the same before/after optimization." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", "maximum absolute difference=1.9222497940063477e-06\r\n", "maximum relative difference=0.05027933046221733\r\n" ] } ], "source": [ "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Model Optimization with Float16\n", "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.90 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.12 ms, Throughput = 320.00 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.02 ms, Throughput = 331.39 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 332.53 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 328.67 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.72 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 329.32 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
03.012.792.812.865.087.16332.53112ACTIVENoneTrue
13.012.802.812.884.527.05331.900NoneTrue
23.012.782.802.925.017.02331.721212ACTIVENoneTrue
33.022.792.802.856.347.04331.39121ACTIVENoneTrue
43.042.802.822.935.567.08329.321212PASSIVENoneTrue
53.042.792.812.926.377.08328.67121PASSIVENoneTrue
63.122.792.822.966.667.20320.00112PASSIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.01 2.79 2.81 2.86 5.08 \n", "1 3.01 2.80 2.81 2.88 4.52 \n", "2 3.01 2.78 2.80 2.92 5.01 \n", "3 3.02 2.79 2.80 2.85 6.34 \n", "4 3.04 2.80 2.82 2.93 5.56 \n", "5 3.04 2.79 2.81 2.92 6.37 \n", "6 3.12 2.79 2.82 2.96 6.66 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 7.16 332.53 1 12 \n", "1 7.05 331.90 0 \n", "2 7.02 331.72 12 12 \n", "3 7.04 331.39 12 1 \n", "4 7.08 329.32 12 12 \n", "5 7.08 328.67 12 1 \n", "6 7.20 320.00 1 12 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 None True \n", "2 ACTIVE None True \n", "3 ACTIVE None True \n", "4 PASSIVE None True \n", "5 PASSIVE None True \n", "6 PASSIVE None True " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Throughput Tuning\n", "\n", "Some application need best throughput under some constraint on latency. This can be done by testing performance of different batch sizes. The tool could help on this.\n", "\n", "Here is an example that check the performance of multiple batch sizes (1, 2, 4, 8, 16, 32 and 64) using default settings." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n", "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.00 ms, Throughput = 333.83 QPS\n", "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.59 ms, Throughput = 557.32 QPS\n", "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=64 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n", "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=4 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.32 ms, Throughput = 926.92 QPS\n", "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n", "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)batch_size
03.002.792.812.864.377.08333.831
13.593.333.353.426.607.54557.322
24.323.984.014.647.238.11926.924
36.325.945.977.618.9610.121266.638
49.609.229.2511.3212.3313.341666.0516
516.1715.8015.9017.3818.8019.931979.4132
629.2628.8929.0130.6332.5333.282187.1564
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.00 2.79 2.81 2.86 4.37 \n", "1 3.59 3.33 3.35 3.42 6.60 \n", "2 4.32 3.98 4.01 4.64 7.23 \n", "3 6.32 5.94 5.97 7.61 8.96 \n", "4 9.60 9.22 9.25 11.32 12.33 \n", "5 16.17 15.80 15.90 17.38 18.80 \n", "6 29.26 28.89 29.01 30.63 32.53 \n", "\n", " Latency_P99 Throughput(QPS) batch_size \n", "0 7.08 333.83 1 \n", "1 7.54 557.32 2 \n", "2 8.11 926.92 4 \n", "3 10.12 1266.63 8 \n", "4 13.34 1666.05 16 \n", "5 19.93 1979.41 32 \n", "6 33.28 2187.15 64 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float16 model summary from\", latest_result_file)\n", "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n", "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Additional Info\n", "\n", "Note that running Jupyter Notebook has significant impact on performance result. You can close Jupyter Notebook and other applications, then run the performance test in a console to get more accurate performance numbers.\n", "\n", "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it measure inference speed of OnnxRuntime.\n", "\n", "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n", "\n", "Here is the machine configuration that generated the above results. You might get slower or faster result according to your hardware." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\r\n", " \"gpu\": {\r\n", " \"driver_version\": \"440.64.00\",\r\n", " \"devices\": [\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 14110883840,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " },\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 16932601856,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " }\r\n", " ]\r\n", " },\r\n", " \"cpu\": {\r\n", " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n", " \"cores\": 12,\r\n", " \"logical_cores\": 12,\r\n", " \"hz\": \"2.5940 GHz\",\r\n", " \"l2_cache\": \"256 KB\",\r\n", " \"l3_cache\": \"35840 KB\",\r\n", " \"processor\": \"x86_64\"\r\n", " },\r\n", " \"memory\": {\r\n", " \"total\": 236645588992,\r\n", " \"available\": 222567559168\r\n", " },\r\n", " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n", " \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n", " \"onnxruntime\": {\r\n", " \"version\": \"1.3.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"pytorch\": {\r\n", " \"version\": \"1.5.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"tensorflow\": null\r\n", "}\r\n" ] } ], "source": [ "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" ] } ], "metadata": { "kernelspec": { "display_name": "PyCharm (ccks_ner-master)", "language": "python", "name": "pycharm-de4c0941" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: code/bert-base-count5/finetuning/Config.py ================================================ from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ RobertaTokenizer, RobertaModel, RobertaConfig from NEZHA.modeling_nezha import NeZhaModel from NEZHA.configuration_nezha import NeZhaConfig MODELS = { 'BertForClass': BertModel, 'BertForClass_MultiDropout': BertModel, 'BertLastTwoCls': BertModel, 'BertLastCls':BertModel, 'BertLastTwoClsPooler': BertModel, 'BertLastTwoEmbeddings': BertModel, 'BertLastTwoEmbeddingsPooler': BertModel, 'BertLastFourCls': BertModel, 'BertLastFourClsPooler': BertModel, 'BertLastFourEmbeddings': BertModel, 'BertLastFourEmbeddingsPooler': BertModel, 'BertDynCls': BertModel, 'BertDynEmbeddings': BertModel, 'BertRNN': BertModel, 'BertCNN': XLNetModel, 'BertRCNN': BertModel, 'XLNet': XLNetModel, 'Electra': ElectraModel, 'NEZHA': NeZhaModel } TOKENIZERS = { 'BertForClass': BertTokenizer, 'BertForClass_MultiDropout': BertTokenizer, 'BertLastTwoCls': BertTokenizer, 'BertLastCls': BertTokenizer, 'BertLastTwoClsPooler': BertTokenizer, 'BertLastTwoEmbeddings': BertTokenizer, 'BertLastTwoEmbeddingsPooler': BertTokenizer, 'BertLastFourCls': BertTokenizer, 'BertLastFourClsPooler': BertTokenizer, 'BertLastFourEmbeddings': BertTokenizer, 'BertLastFourEmbeddingsPooler': BertTokenizer, 'BertDynCls': BertTokenizer, 'BertDynEmbeddings': BertTokenizer, 'BertRNN': BertTokenizer, 'BertCNN': BertTokenizer, 'BertRCNN': BertTokenizer, 'XLNet': XLNetTokenizer, 'Electra': ElectraTokenizer, 'NEZHA': BertTokenizer } CONFIGS = { 'BertForClass': BertConfig, 'BertForClass_MultiDropout': BertConfig, 'BertLastTwoCls': BertConfig, 'BertLastCls': BertConfig, 'BertLastTwoClsPooler': BertConfig, 'BertLastTwoEmbeddings': BertConfig, 'BertLastTwoEmbeddingsPooler': BertConfig, 'BertLastFourCls': BertConfig, 'BertLastFourClsPooler': BertConfig, 'BertLastFourEmbeddings': BertConfig, 'BertLastFourEmbeddingsPooler': BertConfig, 'BertDynCls': BertConfig, 'BertDynEmbeddings': BertConfig, 'BertRNN': BertConfig, 'BertCNN': BertConfig, 'BertRCNN': BertConfig, 'XLNet': XLNetConfig, 'Electra': ElectraConfig, 'NEZHA': NeZhaConfig } ================================================ FILE: code/bert-base-count5/finetuning/NEZHA/configuration_nezha.py ================================================ from transformers import PretrainedConfig NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class NeZhaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "nezha" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, max_relative_position=64, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, use_relative_position=True, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.max_relative_position = max_relative_position self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_relative_position=use_relative_position self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count5/finetuning/NEZHA/modeling_nezha.py ================================================ import math import os import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_nezha import NeZhaConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_utils import PreTrainedModel, prune_linear_layer from transformers.models.bert.modeling_bert import ( BertOutput, BertPooler, BertSelfOutput, BertIntermediate, BertOnlyMLMHead, BertOnlyNSPHead, BertPreTrainingHeads, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING, ) logger = logging.getLogger(__name__) _CONFIG_FOR_DOC = "NeZhaConfig" _TOKENIZER_FOR_DOC = "NeZhaTokenizer" NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: # logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step", "good_steps", "loss_scale", 'bad_steps'] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class NeZhaEmbeddings(nn.Module): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.use_relative_position = config.use_relative_position self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] device = input_ids.device if input_ids is not None else inputs_embeds.device if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings def relative_position_encoding(depth, max_length=512, max_relative_position=127): vocab_size = max_relative_position * 2 + 1 range_vec = torch.arange(max_length) range_mat = range_vec.repeat(max_length).view(max_length, max_length) distance_mat = range_mat - torch.t(range_mat) distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) final_mat = distance_mat_clipped + max_relative_position embeddings_table = torch.zeros(vocab_size, depth) position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) embeddings_table[:, 0::2] = torch.sin(position * div_term) embeddings_table[:, 1::2] = torch.cos(position * div_term) embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) flat_relative_positions_matrix = final_mat.view(-1) one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, num_classes=vocab_size).float() positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) return positions_encoding class NeZhaSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, depth=self.attention_head_size, max_relative_position=config.max_relative_position).to('cuda') def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] query_layer_t = query_layer.permute(2, 0, 1, 3) query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, self.attention_head_size) key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, num_attention_heads, from_seq_length) key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] attention_probs_t = attention_probs.permute(2, 0, 1, 3) attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, to_seq_length) value_position_scores = torch.matmul(attentions_probs_r, relations_values) value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, num_attention_heads, self.attention_head_size) value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) context_layer = context_layer + value_position_scores_r_t context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class NeZhaAttention(nn.Module): def __init__(self, config): super().__init__() self.self = NeZhaSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class NeZhaLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = NeZhaAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = NeZhaAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class NeZhaEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class NeZhaPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = NeZhaConfig pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class NeZhaModel(NeZhaPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = NeZhaEmbeddings(config) self.encoder = NeZhaEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( attention_mask, input_shape, self.device ) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class NeZhaForPreTraining(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) # add hidden states and attention if they are here outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class NeZhaForMaskedLM(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. masked_lm_labels = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class NeZhaForSequenceClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class NeZhaForMultipleChoice(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class NeZhaForTokenClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, position_ids=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/finetuning/model.py ================================================ import torch import random import os from torch import nn, optim import torch.nn.functional as F from transformers.activations import get_activation from Config import * class BertForClass(nn.Module): def __init__(self, config): super(BertForClass, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit class BertForClass_MultiDropout(nn.Module): def __init__(self, config): super(BertForClass_MultiDropout, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.multi_drop = 5 self.multi_dropouts = nn.ModuleList([nn.Dropout(config.dropout) for _ in range(self.multi_drop)]) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) for j, dropout in enumerate(self.multi_dropouts): if j == 0: logit = self.classifier(dropout(concat_out)) / self.multi_drop else: logit += self.classifier(dropout(concat_out)) / self.multi_drop return logit class BertLastTwoCls(nn.Module): def __init__(self, config): super(BertLastTwoCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) logit = self.classifier(pooler_output) return logit class BertLastCls(nn.Module): def __init__(self, config): super(BertLastCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] if self.isDropout: output = self.dropout(pooler_output) logit = self.classifier(output) return logit class BertLastTwoClsPooler(nn.Module): def __init__(self, config): super(BertLastTwoClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddings(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourCls(nn.Module): def __init__(self, config): super(BertLastFourCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] output = torch.cat( (hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourClsPooler(nn.Module): def __init__(self, config): super(BertLastFourClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddings(nn.Module): def __init__(self, config): super(BertLastFourEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastFourEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertDynCls(nn.Module): def __init__(self, config): super(BertDynCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = hidden_states[-(i + 1)][0] weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertDynEmbeddings(nn.Module): def __init__(self, config): super(BertDynEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = torch.mean(hidden_states[-(i + 1)], dim=1) weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertRNN(nn.Module): def __init__(self, config): super(BertRNN, self).__init__() self.rnn_type = "gru" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.1 self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.num_directions = 1 if not self.bidirectional else 2 if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) self.dropout = nn.Dropout(self.drop_out) self.fc_rnn = nn.Linear(self.hidden_dim * self.num_directions, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(sequence_output) else: output, (hidden, cell) = self.rnn(sequence_output) # output = [ batch size, sent len, hidden_dim * bidirectional] batch_size, max_seq_len, hidden_dim = output.shape hidden = torch.transpose(hidden, 1, 0) hidden = torch.mean(torch.reshape(hidden, [batch_size, -1, hidden_dim]), dim=1) output = torch.sum(output, dim=1) fc_input = self.dropout(output + hidden) # output = torch.mean(output, dim=1) # fc_input = self.dropout(output) out = self.fc_rnn(fc_input) return out class BertCNN(nn.Module): def __init__(self, config): super(BertCNN, self).__init__() self.num_filters = 100 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.hidden_size = self.bert_config.to_dict()['hidden_size'] self.filter_sizes = {3, 4, 5} self.drop_out = 0.5 self.convs = nn.ModuleList( [nn.Conv2d(1, self.num_filters, (k, self.hidden_size)) for k in self.filter_sizes]) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dropout = nn.Dropout(self.drop_out) self.fc_cnn = nn.Linear(self.num_filters * len(self.filter_sizes), config.num_class) def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) x = F.max_pool1d(x, x.size(2)).squeeze(2) return x def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = self.dropout(sequence_output) out = sequence_output.unsqueeze(1) out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) out = self.dropout(out) out = self.fc_cnn(out) return out class BertRCNN(nn.Module): def __init__(self, config): super(BertRCNN, self).__init__() self.rnn_type = "lstm" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.5 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) # self.maxpool = nn.MaxPool1d() self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.fc = nn.Linear(self.hidden_dim * self.n_layers, config.num_class) self.dropout = nn.Dropout(self.drop_out) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sentence_len = sequence_output.shape[1] pooler_output = pooler_output.unsqueeze(dim=1).repeat(1, sentence_len, 1) bert_sentence = sequence_output + pooler_output self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(bert_sentence) else: output, (hidden, cell) = self.rnn(bert_sentence) batch_size, max_seq_len, hidden_dim = output.shape out = torch.transpose(output.relu(), 1, 2) out = F.max_pool1d(out, max_seq_len).squeeze() out = self.fc(out) return out class XLNet(nn.Module): def __init__(self, config): super(XLNet, self).__init__() self.xlnet = XLNetModel.from_pretrained(config.model_path) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.fc = nn.Linear(self.xlnet.d_model, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output = self.xlnet(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = torch.sum(sequence_output[0], dim=1) if self.isDropout: sequence_output = self.dropout(sequence_output) out = self.fc(sequence_output) return out class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x class Electra(nn.Module): def __init__(self, config): super(Electra, self).__init__() self.electra = ElectraModel.from_pretrained(config.model_path) config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.electra_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) self.electra_config.num_labels = config.num_class self.fc = ElectraClassificationHead(self.electra_config) def forward(self, input_ids, input_masks, segment_ids): discriminator_hidden_states = self.electra(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = discriminator_hidden_states[0] out = self.fc(sequence_output) return out class NEZHA(nn.Module): def __init__(self, config): super(NEZHA, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) #self.bert_model = MODELS[config.model](config=self.bert_config) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) # NEZHA init #torch_init_model(self.bert_model, os.path.join(config.model_path, 'pytorch_model.bin')) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit ================================================ FILE: code/bert-base-count5/finetuning/models/gitkeep ================================================ ================================================ FILE: code/bert-base-count5/finetuning/multi_gpu_QA.py ================================================ from tqdm import tqdm, trange import numpy as np import pandas as pd import logging import torch import random import os from torch import nn, optim from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig from transformers.optimization import get_linear_schedule_with_warmup from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score from model import * from utils import * import time import logging logging.basicConfig(level=logging.DEBUG, filename="train.log",filemode='a') from NEZHA.modeling_nezha import * MODEL_CLASSES = { 'BertForClass': BertForClass, 'BertLastCls': BertLastCls, 'BertLastTwoCls': BertLastTwoCls, 'BertLastTwoClsPooler': BertLastTwoClsPooler, 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 'BertLastFourCls': BertLastFourCls, 'BertLastFourClsPooler': BertLastFourClsPooler, 'BertLastFourEmbeddings': BertLastFourEmbeddings, 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 'BertDynCls': BertDynCls, 'BertDynEmbeddings': BertDynEmbeddings, 'BertRNN': BertRNN, 'BertCNN': BertCNN, 'BertRCNN': BertRCNN, 'XLNet': XLNet, 'Electra': Electra, 'NEZHA': NEZHA, } class Config: def __init__(self): # 预训练模型路径 self.modelId = 2 self.model = "BertForClass" self.Stratification = False self.model_path = '../pretrain/bert_model/' self.num_class = 2 self.dropout = 0.2 self.MAX_LEN = 100 self.epoch = 3 self.learn_rate = 2e-5 self.normal_lr = 1e-4 self.batch_size = 32 self.k_fold = 10 self.seed = 42 self.device = torch.device('cuda') # self.device = torch.device('cpu') self.focalloss = False self.pgd = False self.fgm = True config = Config() os.environ['PYTHONHASHSEED']='0'#消除hash算法的随机性 random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) file_path = './log/' # 创建一个logger logger = logging.getLogger('mylogger') logger.setLevel(logging.DEBUG) train = pd.read_csv('/tcdata/gaiic_track3_round1_train_20210228.tsv',sep='\t',header=None) semi = pd.read_csv('/tcdata/gaiic_track3_round2_train_20210407.tsv',sep='\t',header=None) train = pd.concat([train, semi], sort=False) train.columns=['q1','q2','label'] train_query1 = train['q1'].values.astype(str) train_query2 = train['q2'].values.astype(str) train_label = train['label'].values.astype(int) oof_train = np.zeros((len(train), config.num_class), dtype=np.float32) #kf = StratifiedKFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) kf = KFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) for fold, (train_index, valid_index) in enumerate(kf.split(train_query1, train_label)): print('\n\n------------fold:{}------------\n'.format(fold)) ''' q1 = train_query1[train_index] q2 = train_query2[train_index] y = train_label[train_index] ''' q1 = train_query1 q2 = train_query2 y = train_label val_q1 = train_query1[valid_index] val_q2 = train_query2[valid_index] val_y = train_label[valid_index] train_D = data_generator([q1, q2, y], config, shuffle=True) val_D = data_generator([val_q1, val_q2, val_y], config) model = MODEL_CLASSES[config.model](config).to(config.device) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) if config.pgd: pgd = PGD(model) K = 3 elif config.fgm: fgm = FGM(model) if config.focalloss: loss_fn = FocalLoss(config.num_class) else: loss_fn = nn.CrossEntropyLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 num_train_steps = int(len(train) / config.batch_size * config.epoch) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if config.Stratification: bert_params = [x for x in param_optimizer if 'bert' in x[0]] normal_params = [p for n, p in param_optimizer if 'bert' not in n] optimizer_parameters = [ {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, {'params': normal_params, 'lr': config.normal_lr}, ] else: optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) # lr为全局学习率 scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(len(train) / config.batch_size / 2), num_training_steps=num_train_steps ) best_auc = 0 PATH = './models/bert_{}.pth'.format(fold) save_model_path = './models/' if not os.path.exists(save_model_path): os.makedirs(save_model_path) for e in range(config.epoch): print('\n------------epoch:{}------------'.format(e)) model.train() acc = 0 train_len = 0 loss_num = 0 tq = tqdm(train_D,ncols=70,disable=True) last=time.time() for input_ids, input_masks, segment_ids, labels in tq: label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) loss = loss_fn(y_pred, label_t) loss = loss.mean() loss.backward() if config.pgd: pgd.backup_grad() # 对抗训练 for t in range(K): pgd.attack(is_first_attack=(t == 0)) # 在embedding上添加对抗扰动, first attack时备份param.data if t != K - 1: model.zero_grad() else: pgd.restore_grad() y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 pgd.restore() # 恢复embedding参数 elif config.fgm: # 对抗训练 fgm.attack() # 在embedding上添加对抗扰动 y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 fgm.restore() # 恢复embedding参数 # 梯度下降,更新参数 optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) acc += sum(y_pred == labels) loss_num += loss.item() train_len += len(labels) tq.set_postfix(fold=fold, epoch=e, loss=loss_num / train_len, acc=acc / train_len) print(f"微调第{e}轮耗时:{time.time()-last}") model.eval() with torch.no_grad(): y_p = [] y_l = [] train_logit = None for input_ids, input_masks, segment_ids, labels in tqdm(val_D,disable=True): label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) y_pred = F.softmax(y_pred) y_pred = y_pred.detach().to("cpu").numpy() if train_logit is None: train_logit = y_pred else: train_logit = np.vstack((train_logit, y_pred)) y_p += list(y_pred[:,1]) y_pred = np.argmax(y_pred, axis=1) y_l += list(y_pred) f1 = f1_score(val_y, y_l, average="macro") auc_score = roc_auc_score(val_y, y_p) print("best_auc:{} auc_score:{} f1:{}\n".format(best_auc, auc_score, f1)) if auc_score >= best_auc: best_auc = auc_score oof_train[valid_index] = np.array(train_logit) #torch.save(model.module.state_dict() if hasattr(model, "module") else model.state_dict(), PATH) torch.save(model.module if hasattr(model, "module") else model, PATH) optimizer.zero_grad() del model torch.cuda.empty_cache() break ================================================ FILE: code/bert-base-count5/finetuning/utils.py ================================================ import torch from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig import numpy as np import os import random from Config import * import torch import torch.nn as nn import torch.nn.functional as F def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def fastTokenizer(a:str,b:str,maxLen,tk): a,b=a.split(),b.split() a,b=tk.convert_tokens_to_ids(a),tk.convert_tokens_to_ids(b) maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] input_ids=[tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] token_type_ids=[0]*(len(a)+2)+[1]*(len(b)+1) return {'input_ids': input_ids, 'token_type_ids': token_type_ids} class data_generator: def __init__(self, data, config, shuffle=False): self.data = data self.batch_size = config.batch_size self.max_length = config.MAX_LEN self.shuffle = shuffle vocab = 'vocab.txt' if os.path.exists(config.model_path + 'vocab.txt') else 'spiece.model' self.tokenizer = TOKENIZERS[config.model].from_pretrained(config.model_path + vocab) self.steps = len(self.data[0]) // self.batch_size if len(self.data[0]) % self.batch_size != 0: self.steps += 1 def __len__(self): return self.steps def __iter__(self): q1, q2, y = self.data idxs = list(range(len(self.data[0]))) if self.shuffle: np.random.shuffle(idxs) input_ids, input_masks, segment_ids, labels = [], [], [], [] for index, i in enumerate(idxs): text = q1[i] text_pair = q2[i] ''' # text = self.tokenizer(text, text_pair, padding='max_length', truncation=True, max_length=self.max_length) text = fastTokenizer(text, text_pair, self.max_length, self.tokenizer) input_ids.append(text['input_ids']) segment_ids.append(text['token_type_ids']) input_masks.append([1] * len(text['input_ids'])) # bs为1时无padding,全1 yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] ''' tkRes = self.tokenizer(text, text_pair, max_length=self.max_length, truncation='longest_first', return_attention_mask=False) input_id = tkRes['input_ids'] segment_id = tkRes['token_type_ids'] assert len(segment_id) == len(input_id) input_ids.append(input_id) segment_ids.append(segment_id) labels.append(y[i]) if len(input_ids) == self.batch_size or i == idxs[-1]: input_ids = paddingList(input_ids, 0, returnTensor=True) # 动态padding segment_ids = paddingList(segment_ids, 0, returnTensor=True) input_masks = (input_ids != 0) yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] class PGD(): def __init__(self, model): self.model = model self.emb_backup = {} self.grad_backup = {} def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: if is_first_attack: self.emb_backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0 and not torch.isnan(norm): r_at = alpha * param.grad / norm param.data.add_(r_at) param.data = self.project(name, param.data, epsilon) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.emb_backup param.data = self.emb_backup[name] self.emb_backup = {} def project(self, param_name, param_data, epsilon): r = param_data - self.emb_backup[param_name] if torch.norm(r) > epsilon: r = epsilon * r / torch.norm(r) return self.emb_backup[param_name] + r def backup_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: self.grad_backup[name] = param.grad.clone() def restore_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: param.grad = self.grad_backup[name] class FGM(): def __init__(self, model): self.model = model self.backup = {} def attack(self, epsilon=0.25, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: self.backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0: r_at = epsilon * param.grad / norm param.data.add_(r_at) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.backup param.data = self.backup[name] self.backup = {} # 支持多分类和二分类 class FocalLoss(nn.Module): """ This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) :param num_class: :param alpha: (tensor) 3D or 4D the scalar factor for this criterion :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more focus on hard misclassified example :param smooth: (float,double) smooth value when cross entropy :param balance_index: (int) balance class index, should be specific when alpha is float :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. """ def __init__(self, num_class, alpha=None, gamma=2, smooth=None, size_average=True): super(FocalLoss, self).__init__() self.num_class = num_class self.alpha = alpha self.gamma = gamma self.smooth = smooth self.size_average = size_average if self.alpha is None: self.alpha = torch.ones(self.num_class, 1) elif isinstance(self.alpha, (list, np.ndarray)): assert len(self.alpha) == self.num_class self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) self.alpha = self.alpha / self.alpha.sum() else: raise TypeError('Not support alpha type') if self.smooth is not None: if self.smooth < 0 or self.smooth > 1.0: raise ValueError('smooth value should be in [0,1]') def forward(self, input, target): logit = F.softmax(input, dim=1) if logit.dim() > 2: # N,C,d1,d2 -> N,C,m (m=d1*d2*...) logit = logit.view(logit.size(0), logit.size(1), -1) logit = logit.permute(0, 2, 1).contiguous() logit = logit.view(-1, logit.size(-1)) target = target.view(-1, 1) # N = input.size(0) # alpha = torch.ones(N, self.num_class) # alpha = alpha * (1 - self.alpha) # alpha = alpha.scatter_(1, target.long(), self.alpha) epsilon = 1e-10 alpha = self.alpha if alpha.device != input.device: alpha = alpha.to(input.device) idx = target.cpu().long() one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() one_hot_key = one_hot_key.scatter_(1, idx, 1) if one_hot_key.device != logit.device: one_hot_key = one_hot_key.to(logit.device) if self.smooth: one_hot_key = torch.clamp( one_hot_key, self.smooth, 1.0 - self.smooth) pt = (one_hot_key * logit).sum(1) + epsilon logpt = pt.log() gamma = self.gamma alpha = alpha[idx] loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt if self.size_average: loss = loss.mean() else: loss = loss.sum() return loss def f1_match(y_true,y_pred): acc = sum(y_pred & y_true) / (sum(y_pred)) rec = sum(y_pred & y_true) / (sum(y_true)) return 2 * acc * rec /(acc + rec) ================================================ FILE: code/bert-base-count5/pretrain/NLP_Utils.py ================================================ import random import json import transformers as _ from transformers1 import BertTokenizer import torch from torch.utils.data import Dataset,DataLoader import numpy as np from itertools import chain def writeToJsonFile(path: str, obj): with open(path, "w", encoding="utf-8") as f: f.write(json.dumps(obj, ensure_ascii=False,indent=0)) def readFromJsonFile(path: str): with open(path, "r", encoding="utf-8") as f: return json.loads(f.read()) def loadData(path): allData=[] with open(path,"r") as f: for i in f: i=i.strip().split('\t') if len(i)==0:#防止空行 break if len(i)==3:#训练集 a,b,label=i a=a.split(' ') b=b.split(' ') else:#测试集,直接转为id形式 a,b,label=i[0],i[1],-1 a=a.split(' ') b=b.split(' ') allData.append([a,b,label]) return allData def calNegPos(ls):#计算正负比例 posNum,negNum=0,0 for i in ls: if i[2]==0: negNum+=1 elif i[2]==1: posNum+=1 posNum=1 if posNum==0 else posNum return negNum,posNum,round(negNum/posNum,4) allData=loadData('/tcdata/gaiic_track3_round1_train_20210228.tsv')+loadData('/tcdata/gaiic_track3_round2_train_20210407.tsv') testA_data = loadData('/tcdata/gaiic_track3_round1_testA_20210228.tsv') testB_data = loadData('/tcdata/gaiic_track3_round1_testB_20210317.tsv') random.shuffle(allData) train_data=allData+testA_data+testB_data#全量 valid_data=allData[-20000:] print("训练集样本数量:", len(train_data)) def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def truncate(a:list,b:list,maxLen): maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] return a,b class MLM_Data(Dataset): #传入句子对列表 def __init__(self,textLs:list,maxLen:int,tk:BertTokenizer): super().__init__() self.data=textLs self.maxLen=maxLen self.tk=tk self.spNum=len(tk.all_special_tokens) self.tkNum=tk.vocab_size def __len__(self): return len(self.data) def random_mask(self,text_ids): input_ids, output_ids = [], [] rands = np.random.random(len(text_ids)) idx=0 while idx0.5: text1,text2=text2,text1#交换位置 text1,text2=truncate(text1,text2,self.maxLen) text1_ids,text2_ids = self.tk.convert_tokens_to_ids(text1),self.tk.convert_tokens_to_ids(text2) text1_ids, out1_ids = self.random_mask(text1_ids)#添加mask预测 text2_ids, out2_ids = self.random_mask(text2_ids) input_ids = [self.tk.cls_token_id] + text1_ids + [self.tk.sep_token_id] + text2_ids + [self.tk.sep_token_id]#拼接 token_type_ids=[0]*(len(text1_ids)+2)+[1]*(len(text2_ids)+1) labels = [-100] + out1_ids + [-100] + out2_ids + [-100] assert len(input_ids)==len(token_type_ids)==len(labels) return {'input_ids':input_ids,'token_type_ids':token_type_ids,'labels':labels} @classmethod def collate(cls,batch): input_ids=[i['input_ids'] for i in batch] token_type_ids=[i['token_type_ids'] for i in batch] labels=[i['labels'] for i in batch] input_ids=paddingList(input_ids,0,returnTensor=True) token_type_ids=paddingList(token_type_ids,0,returnTensor=True) labels=paddingList(labels,-100,returnTensor=True) attention_mask=(input_ids!=0) return {'input_ids':input_ids,'token_type_ids':token_type_ids ,'attention_mask':attention_mask,'labels':labels} unionList=lambda ls:list(chain(*ls))#按元素拼接 splitList=lambda x,bs:[x[i:i+bs] for i in range(0,len(x),bs)]#按bs切分 #sortBsNum:原序列按多少个bs块为单位排序,可用来增强随机性 #比如如果每次打乱后都全体一起排序,那每次都是一样的 def blockShuffle(data:list,bs:int,sortBsNum,key): random.shuffle(data)#先打乱 tail=len(data)%bs#计算碎片长度 tail=[] if tail==0 else data[-tail:] data=data[:len(data)-len(tail)] assert len(data)%bs==0#剩下的一定能被bs整除 sortBsNum=len(data)//bs if sortBsNum is None else sortBsNum#为None就是整体排序 data=splitList(data,sortBsNum*bs) data=[sorted(i,key=key,reverse=True) for i in data]#每个大块进行降排序 data=unionList(data) data=splitList(data,bs)#最后,按bs分块 random.shuffle(data)#块间打乱 data=unionList(data)+tail return data from torch.utils.data.dataloader import _SingleProcessDataLoaderIter,_MultiProcessingDataLoaderIter #每轮迭代重新分块shuffle数据的DataLoader class blockShuffleDataLoader(DataLoader): def __init__(self, dataset: Dataset,sortBsNum,key,**kwargs): assert isinstance(dataset.data,list)#需要有list类型的data属性 super().__init__(dataset,**kwargs)#父类的参数传过去 self.sortBsNum=sortBsNum self.key=key def __iter__(self): #分块shuffle self.dataset.data=blockShuffle(self.dataset.data,self.batch_size,self.sortBsNum,self.key) if self.num_workers == 0: return _SingleProcessDataLoaderIter(self) else: return _MultiProcessingDataLoaderIter(self) ================================================ FILE: code/bert-base-count5/pretrain/__init__.py ================================================ ================================================ FILE: code/bert-base-count5/pretrain/bert_model/gitkeep ================================================ ================================================ FILE: code/bert-base-count5/pretrain/train_bert.py ================================================ # coding:utf-8 import numpy as np import random import os random.seed(0) np.random.seed(0)#seed应该在main里尽早设置,以防万一 os.environ['PYTHONHASHSEED'] =str(0)#消除hash算法的随机性 from transformers import BertForMaskedLM#除nezha外模型用新版加载 from transformers1 import Trainer, TrainingArguments,BertTokenizer,BertConfig from NLP_Utils import MLM_Data,train_data,blockShuffleDataLoader maxlen=100 batch_size=128 vocab_file_dir = './bert_model/vocab.txt' tokenizer = BertTokenizer.from_pretrained(vocab_file_dir) config = BertConfig( vocab_size=len(tokenizer), hidden_size=768, num_hidden_layers=12, num_attention_heads=12, max_position_embeddings=512, ) # 把层数改为8层 model = BertForMaskedLM.from_pretrained('../../bert-base-chinese') model.resize_token_embeddings(len(tokenizer)) print(model) train_MLM_data=MLM_Data(train_data,maxlen,tokenizer) #自己定义dataloader,不要用huggingface的 dl=blockShuffleDataLoader(train_MLM_data,None,key=lambda x:len(x[0])+len(x[1]),shuffle=False ,batch_size=batch_size,collate_fn=train_MLM_data.collate) training_args = TrainingArguments( output_dir='./bert_output', overwrite_output_dir=True, num_train_epochs=400, per_device_train_batch_size=batch_size, save_steps=len(dl)*10000,#每10个epoch save一次 save_total_limit=3, logging_steps=len(dl),#每个epoch log一次 seed=2021, learning_rate=5e-5, lr_end=1e-5,#学习率衰减的终点 weight_decay=0.01, warmup_steps=int(450000*150/batch_size*0.03) ) trainer = Trainer( model=model, args=training_args, train_dataLoader=dl, prediction_loss_only=True, ) if __name__ == '__main__': trainer.train() trainer.save_model('./bert_model') ================================================ FILE: code/bert-base-count5/pretrain/transformers1/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. __version__ = "2.11.0" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. # see: https://github.com/abseil/abseil-py/issues/99 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 try: import absl.logging except ImportError: pass else: absl.logging.set_verbosity("info") absl.logging.set_stderrthreshold("info") absl.logging._warn_preinit_stderr = False import logging # Configurations from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig from .configuration_bart import BartConfig from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig from .configuration_marian import MarianConfig from .configuration_mmbt import MMBTConfig from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig from .configuration_utils import PretrainedConfig from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig from .data import ( DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor, SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels, is_sklearn_available, squad_convert_examples_to_features, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, ) # Files and general utilities from .file_utils import ( CONFIG_NAME, MODEL_CARD_NAME, PYTORCH_PRETRAINED_BERT_CACHE, PYTORCH_TRANSFORMERS_CACHE, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, TRANSFORMERS_CACHE, WEIGHTS_NAME, add_end_docstrings, add_start_docstrings, cached_path, is_tf_available, is_torch_available, ) from .hf_argparser import HfArgumentParser # Model Cards from .modelcard import ModelCard # TF 2.0 <=> PyTorch conversion utilities from .modeling_tf_pytorch_utils import ( convert_tf_weight_name_to_pt_weight_name, load_pytorch_checkpoint_in_tf2_model, load_pytorch_model_in_tf2_model, load_pytorch_weights_in_tf2_model, load_tf2_checkpoint_in_pytorch_model, load_tf2_model_in_pytorch_model, load_tf2_weights_in_pytorch_model, ) # Pipelines from .pipelines import ( CsvPipelineDataFormat, FeatureExtractionPipeline, FillMaskPipeline, JsonPipelineDataFormat, NerPipeline, PipedPipelineDataFormat, Pipeline, PipelineDataFormat, QuestionAnsweringPipeline, SummarizationPipeline, TextClassificationPipeline, TextGenerationPipeline, TokenClassificationPipeline, TranslationPipeline, pipeline, ) # Tokenizers from .tokenization_albert import AlbertTokenizer from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from .tokenization_bart import BartTokenizer, MBartTokenizer from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_reformer import ReformerTokenizer from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_t5 import T5Tokenizer from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer from .trainer_utils import EvalPrediction from .training_args import TrainingArguments from .training_args_tf import TFTrainingArguments logger = logging.getLogger(__name__) # pylint: disable=invalid-name if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics # Modeling if is_torch_available(): from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, apply_chunking_to_forward from .modeling_auto import ( AutoModel, AutoModelForPreTraining, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead, AutoModelForTokenClassification, AutoModelForMultipleChoice, MODEL_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, ) from .modeling_bert import ( BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering, load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertLayer, ) from .modeling_openai import ( OpenAIGPTPreTrainedModel, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_transfo_xl import ( TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, AdaptiveEmbedding, load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_gpt2 import ( GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_LIST from .modeling_xlnet import ( XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForTokenClassification, XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering, load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_xlm import ( XLMPreTrainedModel, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForTokenClassification, XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, XLM_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_bart import ( BartForSequenceClassification, BartModel, BartForConditionalGeneration, BART_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_marian import MarianMTModel from .tokenization_marian import MarianTokenizer from .modeling_roberta import ( RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification, RobertaForQuestionAnswering, ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_distilbert import ( DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_camembert import ( CamembertForMaskedLM, CamembertModel, CamembertForSequenceClassification, CamembertForMultipleChoice, CamembertForTokenClassification, CamembertForQuestionAnswering, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_encoder_decoder import EncoderDecoderModel from .modeling_t5 import ( T5PreTrainedModel, T5Model, T5ForConditionalGeneration, load_tf_weights_in_t5, T5_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_albert import ( AlbertPreTrainedModel, AlbertModel, AlbertForPreTraining, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, AlbertForTokenClassification, load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_xlm_roberta import ( XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification from .modeling_flaubert import ( FlaubertModel, FlaubertWithLMHeadModel, FlaubertForSequenceClassification, FlaubertForQuestionAnswering, FlaubertForQuestionAnsweringSimple, FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_electra import ( ElectraForPreTraining, ElectraForMaskedLM, ElectraForTokenClassification, ElectraPreTrainedModel, ElectraForSequenceClassification, ElectraModel, load_tf_weights_in_electra, ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_reformer import ( ReformerAttention, ReformerLayer, ReformerModel, ReformerModelWithLMHead, REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_longformer import ( LongformerModel, LongformerForMaskedLM, LongformerForSequenceClassification, LongformerForMultipleChoice, LongformerForTokenClassification, LongformerForQuestionAnswering, LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) # Optimization from .optimization import ( AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup, ) # Trainer from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments # Benchmarks from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments # TensorFlow if is_tf_available(): from .modeling_tf_utils import ( TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, tf_top_k_top_p_filtering, ) from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForPreTraining, TFAutoModelForMultipleChoice, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_MODEL_MAPPING, TF_MODEL_FOR_PRETRAINING_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, ) from .modeling_tf_bert import ( TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, TFBertModel, TFBertForPreTraining, TFBertForMaskedLM, TFBertForNextSentencePrediction, TFBertForSequenceClassification, TFBertForMultipleChoice, TFBertForTokenClassification, TFBertForQuestionAnswering, TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_gpt2 import ( TFGPT2PreTrainedModel, TFGPT2MainLayer, TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_openai import ( TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer, TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_transfo_xl import ( TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, TFAdaptiveEmbedding, ) from .modeling_tf_xlnet import ( TFXLNetPreTrainedModel, TFXLNetMainLayer, TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForTokenClassification, TFXLNetForQuestionAnsweringSimple, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_xlm import ( TFXLMPreTrainedModel, TFXLMMainLayer, TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_xlm_roberta import ( TFXLMRobertaForMaskedLM, TFXLMRobertaModel, TFXLMRobertaForSequenceClassification, TFXLMRobertaForTokenClassification, TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_roberta import ( TFRobertaPreTrainedModel, TFRobertaMainLayer, TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaForQuestionAnswering, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_camembert import ( TFCamembertModel, TFCamembertForMaskedLM, TFCamembertForSequenceClassification, TFCamembertForTokenClassification, TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_flaubert import ( TFFlaubertModel, TFFlaubertWithLMHeadModel, TFFlaubertForSequenceClassification, TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_distilbert import ( TFDistilBertPreTrainedModel, TFDistilBertMainLayer, TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertForQuestionAnswering, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_ctrl import ( TFCTRLPreTrainedModel, TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_albert import ( TFAlbertPreTrainedModel, TFAlbertMainLayer, TFAlbertModel, TFAlbertForPreTraining, TFAlbertForMaskedLM, TFAlbertForMultipleChoice, TFAlbertForSequenceClassification, TFAlbertForQuestionAnswering, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_t5 import ( TFT5PreTrainedModel, TFT5Model, TFT5ForConditionalGeneration, TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_tf_electra import ( TFElectraPreTrainedModel, TFElectraModel, TFElectraForPreTraining, TFElectraForMaskedLM, TFElectraForTokenClassification, TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ) # Optimization from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator # Trainer from .trainer_tf import TFTrainer if not is_tf_available() and not is_torch_available(): logger.warning( "Neither PyTorch nor TensorFlow >= 2.0 have been found." "Models won't be available and only tokenizers, configuration" "and file/data utilities can be used." ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/__main__.py ================================================ # coding: utf8 def main(): import sys if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: print( "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" "It should be used as one of: \n" ">> transformers1 bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" ">> transformers1 gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" ">> transformers1 transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" ">> transformers1 gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" ">> transformers1 xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" ">> transformers1 xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") else: if sys.argv[1] == "bert": try: from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) != 5: # pylint: disable=line-too-long print("Should be used as `transformers1 bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") else: PYTORCH_DUMP_OUTPUT = sys.argv.pop() TF_CONFIG = sys.argv.pop() TF_CHECKPOINT = sys.argv.pop() convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "gpt": from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") else: OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: OPENAI_GPT_CONFIG = sys.argv[4] else: OPENAI_GPT_CONFIG = "" convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, OPENAI_GPT_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "transfo_xl": try: from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: if 'ckpt' in sys.argv[2].lower(): TF_CHECKPOINT = sys.argv[2] TF_DATASET_FILE = "" else: TF_DATASET_FILE = sys.argv[2] TF_CHECKPOINT = "" PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: TF_CONFIG = sys.argv[4] else: TF_CONFIG = "" convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) elif sys.argv[1] == "gpt2": try: from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long print("Should be used as `transformers1 gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: TF_CHECKPOINT = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] if len(sys.argv) == 5: TF_CONFIG = sys.argv[4] else: TF_CONFIG = "" convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "xlnet": try: from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch except ImportError: print("transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 5 or len(sys.argv) > 6: # pylint: disable=line-too-long print("Should be used as `transformers1 xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") else: TF_CHECKPOINT = sys.argv[2] TF_CONFIG = sys.argv[3] PYTORCH_DUMP_OUTPUT = sys.argv[4] if len(sys.argv) == 6: FINETUNING_TASK = sys.argv[5] else: FINETUNING_TASK = None convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, FINETUNING_TASK) elif sys.argv[1] == "xlm": from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch if len(sys.argv) != 4: # pylint: disable=line-too-long print("Should be used as `transformers1 xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") else: XLM_CHECKPOINT_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) if __name__ == '__main__': main() ================================================ FILE: code/bert-base-count5/pretrain/transformers1/activations.py ================================================ import logging import math import torch import torch.nn.functional as F logger = logging.getLogger(__name__) def swish(x): return x * torch.sigmoid(x) def _gelu_python(x): """ Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in torch.nn.functional Also see https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) def gelu_new(x): """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see https://arxiv.org/abs/1606.08415 """ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) if torch.__version__ < "1.4.0": gelu = _gelu_python else: gelu = F.gelu def gelu_fast(x): return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) ACT2FN = { "relu": F.relu, "swish": swish, "gelu": gelu, "tanh": torch.tanh, "gelu_new": gelu_new, "gelu_fast": gelu_fast, } def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys()))) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/another_try.py ================================================ from transformers import TFBertModel, BertTokenizer, BertConfig import tensorflow as tf config = BertConfig.from_pretrained("bert-base-cased", output_hidden_states=True) model = TFBertModel.from_pretrained("bert-base-cased", config=config) tok = BertTokenizer.from_pretrained("bert-base-cased") text = tok.encode("Ain't this [MASK] best thing you've ever seen?") inputs = tf.constant(text) outputs = model.predict(inputs) print(outputs) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from ..file_utils import is_torch_available if is_torch_available(): from .benchmark_args import PyTorchBenchmarkArguments from .benchmark import PyTorchBenchmark ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark/benchmark.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Benchmarking the library on inference and training in PyTorch. """ import inspect import logging import timeit from transformers import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, PretrainedConfig, is_torch_available from .benchmark_utils import Benchmark, Memory, start_memory_tracing, stop_memory_tracing if is_torch_available(): import torch from .benchmark_args import PyTorchBenchmarkArguments logger = logging.getLogger(__name__) class PyTorchBenchmark(Benchmark): args: PyTorchBenchmarkArguments configs: PretrainedConfig framework: str = "PyTorch" @property def framework_version(self): return torch.__version__ def train(self, model_name, batch_size, sequence_length, trace_memory=False): try: config = self.config_dict[model_name] model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) model.to(self.args.device) model.train() input_ids = torch.randint( model.config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device ) def compute_loss_and_backprob(): # TODO: Not all models call labels argument labels => this hack using the function signature should be corrected once all models have a common name for labels function_argument_names = inspect.getfullargspec(model.forward).args if "labels" in function_argument_names: loss = model(input_ids, labels=input_ids)[0] elif "lm_labels" in function_argument_names: loss = model(input_ids, lm_labels=input_ids)[0] elif "masked_lm_labels" in function_argument_names: loss = model(input_ids, masked_lm_labels=input_ids)[0] else: NotImplementedError(f"{model_name} does not seem to allow training with labels") loss.backward() model.zero_grad() if trace_memory is True: if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: trace = start_memory_tracing("transformers1") else: # clear cuda cache torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() # calculate loss and do backpropagation compute_loss_and_backprob() if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: summary = stop_memory_tracing(trace) memory = summary.total else: memory = Memory(torch.cuda.max_memory_reserved()) return memory else: # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average runtimes = timeit.repeat(lambda: compute_loss_and_backprob(), repeat=self.args.repeat, number=10,) return min(runtimes) / 10.0 except RuntimeError as e: self.print_fn("Doesn't fit on GPU. {}".format(e)) return "N/A" def inference(self, model_name, batch_size, sequence_length, trace_memory=False): try: config = self.config_dict[model_name] model = MODEL_MAPPING[config.__class__](config) model.to(self.args.device) model.eval() input_ids = torch.randint( config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device ) if trace_memory is True: if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: trace = start_memory_tracing("transformers1") else: # clear cuda cache torch.cuda.empty_cache() if hasattr(torch.cuda, "max_memory_reserved"): torch.cuda.reset_peak_memory_stats() else: logger.info( "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" ) torch.cuda.reset_max_memory_cached() model(input_ids) if self.args.trace_memory_line_by_line or self.args.n_gpu == 0: summary = stop_memory_tracing(trace) memory = summary.total else: if hasattr(torch.cuda, "max_memory_reserved"): memory = Memory(torch.cuda.max_memory_reserved()) else: logger.info( "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" ) memory = Memory(torch.cuda.max_memory_cached()) return memory else: # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average runtimes = timeit.repeat(lambda: model(input_ids), repeat=self.args.repeat, number=10,) return min(runtimes) / 10.0 except RuntimeError as e: self.print_fn("Doesn't fit on GPU. {}".format(e)) return "N/A" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark/benchmark_args.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from dataclasses import dataclass, field from typing import Tuple from ..file_utils import cached_property, is_torch_available, torch_required from .benchmark_args_utils import BenchmarkArguments if is_torch_available(): import torch try: import torch_xla.core.xla_model as xm _has_tpu = True except ImportError: _has_tpu = False @torch_required def is_tpu_available(): return _has_tpu logger = logging.getLogger(__name__) @dataclass class PyTorchBenchmarkArguments(BenchmarkArguments): no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"}) torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"}) fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) @cached_property @torch_required def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_tpu_available(): device = xm.xla_device() n_gpu = 0 else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() return device, n_gpu @property @torch_required def device_idx(self) -> int: return torch.cuda.current_device() @property @torch_required def device(self) -> "torch.device": return self._setup_devices[0] @property @torch_required def n_gpu(self): return self._setup_devices[1] ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark/benchmark_args_utils.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import dataclasses import json from dataclasses import dataclass, field from time import time from typing import List def list_field(default=None, metadata=None): return field(default_factory=lambda: default, metadata=metadata) @dataclass class BenchmarkArguments: """ BenchMarkArguments are arguments we use in our benchmark scripts **which relate to the training loop itself**. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ models: List[str] = list_field( default=[], metadata={ "help": "Model checkpoints to be provided to the AutoModel classes. Leave blank to benchmark the base version of all available models" }, ) batch_sizes: List[int] = list_field( default=[8], metadata={"help": "List of batch sizes for which memory and time performance will be evaluated"} ) sequence_lengths: List[int] = list_field( default=[8, 32, 128, 512], metadata={"help": "List of sequence lengths for which memory and time performance will be evaluated"}, ) no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"}) training: bool = field(default=False, metadata={"help": "Benchmark training of model"}) verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"}) no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurments"}) no_memory: bool = field(default=False, metadata={"help": "Don't perform memory measurments"}) trace_memory_line_by_line: bool = field(default=False, metadata={"help": "Trace memory line by line"}) save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"}) log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"}) no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"}) inference_time_csv_file: str = field( default=f"inference_time_{round(time())}.csv", metadata={"help": "CSV filename used if saving time results to csv."}, ) inference_memory_csv_file: str = field( default=f"inference_memory_{round(time())}.csv", metadata={"help": "CSV filename used if saving memory results to csv."}, ) train_time_csv_file: str = field( default=f"train_time_{round(time())}.csv", metadata={"help": "CSV filename used if saving time results to csv for training."}, ) train_memory_csv_file: str = field( default=f"train_memory_{round(time())}.csv", metadata={"help": "CSV filename used if saving memory results to csv for training."}, ) env_info_csv_file: str = field( default=f"env_info_{round(time())}.csv", metadata={"help": "CSV filename used if saving environment information."}, ) log_filename: str = field( default=f"log_{round(time())}.csv", metadata={"help": "Log filename used if print statements are saved in log."}, ) repeat: int = field(default=3, metadata={"help": "Times an experiment will be run."}) def to_json_string(self): """ Serializes this instance to a JSON string. """ return json.dumps(dataclasses.asdict(self), indent=2) @property def model_names(self): return self.models ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark/benchmark_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import copy import csv import linecache import logging import os import platform import sys from abc import ABC, abstractmethod from collections import defaultdict, namedtuple from datetime import datetime from typing import Iterable, List, NamedTuple, Optional, Union from transformers import AutoConfig, PretrainedConfig from transformers import __version__ as version from ..file_utils import is_tf_available, is_torch_available from .benchmark_args_utils import BenchmarkArguments if is_torch_available(): from torch.cuda import empty_cache as torch_empty_cache if is_tf_available(): from tensorflow.python.eager import context as tf_context logger = logging.getLogger(__name__) # pylint: disable=invalid-name _is_memory_tracing_enabled = False BenchmarkOutput = namedtuple( "BenchmarkOutput", ["time_inference_result", "memory_inference_result", "time_train_result", "memory_train_result"] ) def is_memory_tracing_enabled(): global _is_memory_tracing_enabled return _is_memory_tracing_enabled class Frame(NamedTuple): """ `Frame` is a NamedTuple used to gather the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ filename: str module: str line_number: int event: str line_text: str class UsedMemoryState(NamedTuple): """ `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) """ frame: Frame cpu_memory: int gpu_memory: int class Memory(NamedTuple): """ `Memory` NamedTuple have a single field `bytes` and you can get a human readable str of the number of mega bytes by calling `__repr__` - `byte` (integer): number of bytes, """ bytes: int def __repr__(self) -> str: return str(bytes_to_mega_bytes(self.bytes)) class MemoryState(NamedTuple): """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ frame: Frame cpu: Memory gpu: Memory cpu_gpu: Memory class MemorySummary(NamedTuple): """ `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). """ sequential: List[MemoryState] cumulative: List[MemoryState] current: List[MemoryState] total: Memory MemoryTrace = List[UsedMemoryState] def start_memory_tracing( modules_to_trace: Optional[Union[str, Iterable[str]]] = None, modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None, events_to_trace: str = "line", gpus_to_trace: Optional[List[int]] = None, ) -> MemoryTrace: """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `../../examples/benchmarks.py for a usage example. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident Set Size” (the non-swapped physical memory the process is using). See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info Args: - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers1.modeling_gpt2') - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch') - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events) default to line - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs Return: - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script). - `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install psutil (pip install psutil) to use CPU memory tracing." ) process = None else: process = psutil.Process(os.getpid()) try: from py3nvml import py3nvml py3nvml.nvmlInit() devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace py3nvml.nvmlShutdown() except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to use GPU memory tracing." ) log_gpu = False except (OSError, py3nvml.NVMLError): logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") log_gpu = False else: log_gpu = is_torch_available() or is_tf_available() memory_trace = [] def traceit(frame, event, args): """ Tracing method executed before running each line in a module or sub-module Record memory allocated in a list with debugging information """ global _is_memory_tracing_enabled if not _is_memory_tracing_enabled: return traceit # Filter events if events_to_trace is not None: if isinstance(events_to_trace, str) and event != events_to_trace: return traceit elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace: return traceit # Filter modules name = frame.f_globals["__name__"] if not isinstance(name, str): return traceit else: # Filter whitelist of modules to trace if modules_to_trace is not None: if isinstance(modules_to_trace, str) and modules_to_trace not in name: return traceit elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace): return traceit # Filter blacklist of modules not to trace if modules_not_to_trace is not None: if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name: return traceit elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace): return traceit # Record current tracing state (file, location in file...) lineno = frame.f_lineno filename = frame.f_globals["__file__"] if filename.endswith(".pyc") or filename.endswith(".pyo"): filename = filename[:-1] line = linecache.getline(filename, lineno).rstrip() traced_state = Frame(filename, name, lineno, event, line) # Record current memory state (rss memory) and compute difference with previous memory state cpu_mem = 0 if process is not None: mem = process.memory_info() cpu_mem = mem.rss gpu_mem = 0 if log_gpu: # Clear GPU caches if is_torch_available(): torch_empty_cache() if is_tf_available(): tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802 # Sum used memory for all GPUs py3nvml.nvmlInit() for i in devices: handle = py3nvml.nvmlDeviceGetHandleByIndex(i) meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) gpu_mem += meminfo.used py3nvml.nvmlShutdown() mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem) memory_trace.append(mem_state) return traceit sys.settrace(traceit) global _is_memory_tracing_enabled _is_memory_tracing_enabled = True return memory_trace def stop_memory_tracing( memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True ) -> Optional[MemorySummary]: """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given. Args: - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory Return: - None if `memory_trace` is None - `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). `Memory` named tuple have fields - `byte` (integer): number of bytes, - `string` (string): same as human readable string (ex: "3.5MB") `Frame` are namedtuple used to list the current frame state and have the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ global _is_memory_tracing_enabled _is_memory_tracing_enabled = False if memory_trace is not None and len(memory_trace) > 1: memory_diff_trace = [] memory_curr_trace = [] cumulative_memory_dict = defaultdict(lambda: [0, 0, 0]) for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip( memory_trace[:-1], memory_trace[1:] ): cpu_mem_inc = next_cpu_mem - cpu_mem gpu_mem_inc = next_gpu_mem - gpu_mem cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc memory_diff_trace.append( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) ) memory_curr_trace.append( MemoryState( frame=frame, cpu=Memory(next_cpu_mem), gpu=Memory(next_gpu_mem), cpu_gpu=Memory(next_gpu_mem + next_cpu_mem), ) ) cumulative_memory_dict[frame][0] += cpu_mem_inc cumulative_memory_dict[frame][1] += gpu_mem_inc cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc cumulative_memory = sorted( list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True ) # order by the total CPU + GPU memory increase cumulative_memory = list( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory ) memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True) if ignore_released_memory: total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace) else: total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace) total_memory = Memory(total_memory) return MemorySummary( sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory, ) return None def bytes_to_mega_bytes(memory_amount: int) -> int: """ Utility to convert a number of bytes (int) into a number of mega bytes (int) """ return memory_amount >> 20 class Benchmark(ABC): """ Benchmarks is a simple but feature-complete benchmarking script to compare memory and time performance of models in Transformers. """ args: BenchmarkArguments configs: PretrainedConfig framework: str def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None): self.args = args if configs is None: self.config_dict = { model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names } else: self.config_dict = {model_name: config for model_name, config in zip(self.args.model_names, configs)} self._print_fn = None self._framework_version = None self._environment_info = None @property def print_fn(self): if self._print_fn is None: if self.args.log_print: logging.basicConfig( level=logging.DEBUG, filename=self.args.log_filename, filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s", ) def print_and_log(*args): logging.info(*args) print(*args) self._print_fn = print_and_log else: self._print_fn = print return self._print_fn @property def is_gpu(self): return self.args.n_gpu > 0 @property @abstractmethod def framework_version(self): pass @abstractmethod def train(self, model_name, batch_size, sequence_length): pass @abstractmethod def inference(self, model_name, batch_size, sequence_length): pass def run(self): result_dict = {model_name: {} for model_name in self.args.model_names} inference_result_time = copy.deepcopy(result_dict) inference_result_memory = copy.deepcopy(result_dict) train_result_time = copy.deepcopy(result_dict) train_result_memory = copy.deepcopy(result_dict) for c, model_name in enumerate(self.args.model_names): self.print_fn(f"{c + 1} / {len(self.args.model_names)}") model_dict = { "bs": self.args.batch_sizes, "ss": self.args.sequence_lengths, "result": {i: {} for i in self.args.batch_sizes}, } inference_result_time[model_name] = copy.deepcopy(model_dict) inference_result_memory[model_name] = copy.deepcopy(model_dict) train_result_time[model_name] = copy.deepcopy(model_dict) train_result_memory[model_name] = copy.deepcopy(model_dict) for batch_size in self.args.batch_sizes: for sequence_length in self.args.sequence_lengths: if not self.args.no_inference: if not self.args.no_memory: memory = self.inference(model_name, batch_size, sequence_length, trace_memory=True) inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) inference_result_time[model_name]["result"][batch_size][sequence_length] = time if self.args.training: if not self.args.no_memory: memory = self.train(model_name, batch_size, sequence_length, trace_memory=True) train_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) train_result_time[model_name]["result"][batch_size][sequence_length] = time if not self.args.no_inference: if not self.args.no_speed: self.print_fn("======= INFERENCE - SPEED - RESULT =======") self.print_results(inference_result_time) self.save_to_csv(inference_result_time, self.args.inference_time_csv_file) if not self.args.no_memory: self.print_fn("======= INFERENCE - MEMORY - RESULT =======") self.print_results(inference_result_memory) self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file) if self.args.training: if not self.args.no_speed: self.print_fn("======= TRAIN - SPEED - RESULT =======") self.print_results(train_result_time) self.save_to_csv(train_result_time, self.args.train_time_csv_file) if not self.args.no_memory: self.print_fn("======= TRAIN - MEMORY - RESULT =======") self.print_results(train_result_memory) self.save_to_csv(train_result_memory, self.args.train_memory_csv_file) if not self.args.no_env_print: self.print_fn("\n======== ENVIRONMENT - INFORMATION ========") self.print_fn( "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n" ) if self.args.save_to_csv: with open(self.args.env_info_csv_file, mode="w", newline="") as csv_file: writer = csv.writer(csv_file) for key, value in self.environment_info.items(): writer.writerow([key, value]) return BenchmarkOutput(inference_result_time, inference_result_memory, train_result_time, train_result_memory) @property def environment_info(self): if self._environment_info is None: info = {} info["transformers_version"] = version info["framework"] = self.framework info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" else: info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) info["use_gpu"] = self.is_gpu if self.is_gpu: info["num_gpus"] = self.args.n_gpu try: from py3nvml import py3nvml py3nvml.nvmlInit() handle = py3nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" except (OSError, py3nvml.NVMLError): logger.warning( "Error while initializing comunication with GPU. " "We won't log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" py3nvml.nvmlShutdown() else: info["gpu"] = py3nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes(py3nvml.nvmlDeviceGetMemoryInfo(handle).total) info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(handle) py3nvml.nvmlShutdown() self._environment_info = info return self._environment_info def print_results(self, result_dict): for model_name in self.args.model_names: self.print_fn("\t" + f"======= MODEL CHECKPOINT: {model_name} =======") for batch_size in result_dict[model_name]["bs"]: for sequence_length in result_dict[model_name]["ss"]: result = result_dict[model_name]["result"][batch_size][sequence_length] if isinstance(result, float): self.print_fn( f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{(round(1000 * result) / 1000)}s" ) else: self.print_fn(f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{result} MB") def print_memory_trace_statistics(self, summary: MemorySummary): self.print_fn( "\nLine by line memory consumption:\n" + "\n".join( f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.sequential ) ) self.print_fn( "\nLines with top memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[:6] ) ) self.print_fn( "\nLines with lowest memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[-6:] ) ) self.print_fn(f"\nTotal memory increase: {summary.total}") def save_to_csv(self, result_dict, filename): if not self.args.save_to_csv: return self.print_fn("Saving results to csv.") with open(filename, mode="w") as csv_file: assert len(self.args.model_names) > 0, "At least 1 model should be defined, but got {}".format( self.model_names ) fieldnames = ["model", "batch_size", "sequence_length"] writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["result"]) writer.writeheader() for model_name in self.args.model_names: result_dict_model = result_dict[model_name]["result"] for bs in result_dict_model: for ss in result_dict_model[bs]: result_model = result_dict_model[bs][ss] writer.writerow( { "model": model_name, "batch_size": bs, "sequence_length": ss, "result": ("{}" if not isinstance(result_model, float) else "{:.4f}").format( result_model ), } ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/benchmark_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import linecache import logging import os import sys from collections import defaultdict from typing import Iterable, List, NamedTuple, Optional, Union from .file_utils import is_tf_available, is_torch_available if is_torch_available(): from torch.cuda import empty_cache as torch_empty_cache if is_tf_available(): from tensorflow.python.eager import context as tf_context logger = logging.getLogger(__name__) # pylint: disable=invalid-name _is_memory_tracing_enabled = False def is_memory_tracing_enabled(): global _is_memory_tracing_enabled return _is_memory_tracing_enabled class Frame(NamedTuple): """ `Frame` is a NamedTuple used to gather the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ filename: str module: str line_number: int event: str line_text: str class UsedMemoryState(NamedTuple): """ `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) """ frame: Frame cpu_memory: int gpu_memory: int class Memory(NamedTuple): """ `Memory` NamedTuple have a single field `bytes` and you can get a human readable string of the number of bytes by calling `__repr__` - `byte` (integer): number of bytes, """ bytes: int def __repr__(self) -> str: return bytes_to_human_readable(self.bytes) class MemoryState(NamedTuple): """ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ frame: Frame cpu: Memory gpu: Memory cpu_gpu: Memory class MemorySummary(NamedTuple): """ `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). """ sequential: List[MemoryState] cumulative: List[MemoryState] total: Memory MemoryTrace = List[UsedMemoryState] def start_memory_tracing( modules_to_trace: Optional[Union[str, Iterable[str]]] = None, modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None, events_to_trace: str = "line", gpus_to_trace: Optional[List[int]] = None, ) -> MemoryTrace: """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `../../examples/benchmarks.py for a usage example. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident Set Size” (the non-swapped physical memory the process is using). See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info Args: - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers1.modeling_gpt2') - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch') - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events) default to line - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs Return: - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script). - `UsedMemoryState` are named tuples with the following fields: - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file) - 'cpu_memory': CPU RSS memory state *before* executing the line - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided) `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script """ try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install psutil (pip install psutil) to use CPU memory tracing." ) process = None else: process = psutil.Process(os.getpid()) try: from py3nvml import py3nvml py3nvml.nvmlInit() devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace py3nvml.nvmlShutdown() except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to use GPU memory tracing." ) log_gpu = False except (OSError, py3nvml.NVMLError): logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") log_gpu = False else: log_gpu = is_torch_available() or is_tf_available() memory_trace = [] def traceit(frame, event, args): """ Tracing method executed before running each line in a module or sub-module Record memory allocated in a list with debugging information """ global _is_memory_tracing_enabled if not _is_memory_tracing_enabled: return traceit # Filter events if events_to_trace is not None: if isinstance(events_to_trace, str) and event != events_to_trace: return traceit elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace: return traceit # Filter modules name = frame.f_globals["__name__"] if not isinstance(name, str): return traceit else: # Filter whitelist of modules to trace if modules_to_trace is not None: if isinstance(modules_to_trace, str) and modules_to_trace not in name: return traceit elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace): return traceit # Filter blacklist of modules not to trace if modules_not_to_trace is not None: if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name: return traceit elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace): return traceit # Record current tracing state (file, location in file...) lineno = frame.f_lineno filename = frame.f_globals["__file__"] if filename.endswith(".pyc") or filename.endswith(".pyo"): filename = filename[:-1] line = linecache.getline(filename, lineno).rstrip() traced_state = Frame(filename, name, lineno, event, line) # Record current memory state (rss memory) and compute difference with previous memory state cpu_mem = 0 if process is not None: mem = process.memory_info() cpu_mem = mem.rss gpu_mem = 0 if log_gpu: # Clear GPU caches if is_torch_available(): torch_empty_cache() if is_tf_available(): tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802 # Sum used memory for all GPUs py3nvml.nvmlInit() for i in devices: handle = py3nvml.nvmlDeviceGetHandleByIndex(i) meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) gpu_mem += meminfo.used py3nvml.nvmlShutdown() mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem) memory_trace.append(mem_state) return traceit sys.settrace(traceit) global _is_memory_tracing_enabled _is_memory_tracing_enabled = True return memory_trace def stop_memory_tracing( memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True ) -> Optional[MemorySummary]: """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given. Args: - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory Return: - None if `memory_trace` is None - `MemorySummary` namedtuple otherwise with the fields: - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by substracting the memory after executing each line from the memory before executing said line. - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line obtained by summing repeted memory increase for a line if it's executed several times. The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released) - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default). `Memory` named tuple have fields - `byte` (integer): number of bytes, - `string` (string): same as human readable string (ex: "3.5MB") `Frame` are namedtuple used to list the current frame state and have the following fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields: - `frame` (`Frame`): the current frame (see above) - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple """ global _is_memory_tracing_enabled _is_memory_tracing_enabled = False if memory_trace is not None and len(memory_trace) > 1: memory_diff_trace = [] cumulative_memory_dict = defaultdict(lambda: [0, 0, 0]) for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip( memory_trace[:-1], memory_trace[1:] ): cpu_mem_inc = next_cpu_mem - cpu_mem gpu_mem_inc = next_gpu_mem - gpu_mem cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc memory_diff_trace.append( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) ) cumulative_memory_dict[frame][0] += cpu_mem_inc cumulative_memory_dict[frame][1] += gpu_mem_inc cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc cumulative_memory = sorted( list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True ) # order by the total CPU + GPU memory increase cumulative_memory = list( MemoryState( frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc), ) for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory ) if ignore_released_memory: total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace) else: total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace) total_memory = Memory(total_memory) return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory) return None def bytes_to_human_readable(memory_amount: int) -> str: """ Utility to convert a number of bytes (int) in a human readable string (with units) """ for unit in ["B", "KB", "MB", "GB"]: if memory_amount > -1024.0 and memory_amount < 1024.0: return "{:.3f}{}".format(memory_amount, unit) memory_amount /= 1024.0 return "{:.3f}TB".format(memory_amount) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/__init__.py ================================================ from abc import ABC, abstractmethod from argparse import ArgumentParser class BaseTransformersCLICommand(ABC): @staticmethod @abstractmethod def register_subcommand(parser: ArgumentParser): raise NotImplementedError() @abstractmethod def run(self): raise NotImplementedError() ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/convert.py ================================================ from argparse import ArgumentParser, Namespace from logging import getLogger from transformers.commands import BaseTransformersCLICommand def convert_command_factory(args: Namespace): """ Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. :return: ServeCommand """ return ConvertCommand( args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name ) class ConvertCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ train_parser = parser.add_parser( "convert", help="CLI tool to run convert model from original " "author checkpoints to Transformers PyTorch checkpoints.", ) train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") train_parser.add_argument( "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." ) train_parser.add_argument( "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." ) train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") train_parser.add_argument( "--finetuning_task_name", type=str, default=None, help="Optional fine-tuning task name if the TF model was a finetuned model.", ) train_parser.set_defaults(func=convert_command_factory) def __init__( self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str, config: str, finetuning_task_name: str, *args ): self._logger = getLogger("transformers1-cli/converting") self._logger.info("Loading model {}".format(model_type)) self._model_type = model_type self._tf_checkpoint = tf_checkpoint self._pytorch_dump_output = pytorch_dump_output self._config = config self._finetuning_task_name = finetuning_task_name def run(self): if self._model_type == "albert": try: from transformers.convert_albert_original_tf_checkpoint_to_pytorch import ( convert_tf_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "bert": try: from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( convert_tf_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "gpt": from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( convert_openai_checkpoint_to_pytorch, ) convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "transfo_xl": try: from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( convert_transfo_xl_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) if "ckpt" in self._tf_checkpoint.lower(): TF_CHECKPOINT = self._tf_checkpoint TF_DATASET_FILE = "" else: TF_DATASET_FILE = self._tf_checkpoint TF_CHECKPOINT = "" convert_transfo_xl_checkpoint_to_pytorch( TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE ) elif self._model_type == "gpt2": try: from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( convert_gpt2_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "xlnet": try: from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( convert_xlnet_checkpoint_to_pytorch, ) except ImportError: msg = ( "transformers1 can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise ImportError(msg) convert_xlnet_checkpoint_to_pytorch( self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name ) elif self._model_type == "xlm": from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( convert_xlm_checkpoint_to_pytorch, ) convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) else: raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]") ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/download.py ================================================ from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand def download_command_factory(args): return DownloadCommand(args.model, args.cache_dir, args.force) class DownloadCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): download_parser = parser.add_parser("download") download_parser.add_argument( "--cache-dir", type=str, default=None, help="Path to location to store the models" ) download_parser.add_argument( "--force", action="store_true", help="Force the model to be download even if already in cache-dir" ) download_parser.add_argument("model", type=str, help="Name of the model to download") download_parser.set_defaults(func=download_command_factory) def __init__(self, model: str, cache: str, force: bool): self._model = model self._cache = cache self._force = force def run(self): from transformers import AutoModel, AutoTokenizer AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/env.py ================================================ import platform from argparse import ArgumentParser from transformers import __version__ as version from transformers import is_tf_available, is_torch_available from transformers.commands import BaseTransformersCLICommand def info_command_factory(_): return EnvironmentCommand() class EnvironmentCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): download_parser = parser.add_parser("env") download_parser.set_defaults(func=info_command_factory) def run(self): pt_version = "not installed" pt_cuda_available = "NA" if is_torch_available(): import torch pt_version = torch.__version__ pt_cuda_available = torch.cuda.is_available() tf_version = "not installed" tf_cuda_available = "NA" if is_tf_available(): import tensorflow as tf tf_version = tf.__version__ try: # deprecated in v2.1 tf_cuda_available = tf.test.is_gpu_available() except AttributeError: # returns list of devices, convert to bool tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) info = { "`transformers1` version": version, "Platform": platform.platform(), "Python version": platform.python_version(), "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), "Using GPU in script?": "", "Using distributed or parallel set-up in script?": "", } print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") print(self.format_dict(info)) return info @staticmethod def format_dict(d): return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/run.py ================================================ import logging from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline logger = logging.getLogger(__name__) # pylint: disable=invalid-name def try_infer_format_from_ext(path: str): if not path: return "pipe" for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): return ext raise Exception( "Unable to determine file format from file extension {}. " "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) ) def run_command_factory(args): nlp = pipeline( task=args.task, model=args.model if args.model else None, config=args.config, tokenizer=args.tokenizer, device=args.device, ) format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format reader = PipelineDataFormat.from_str( format=format, output_path=args.output, input_path=args.input, column=args.column if args.column else nlp.default_input_names, overwrite=args.overwrite, ) return RunCommand(nlp, reader) class RunCommand(BaseTransformersCLICommand): def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): self._nlp = nlp self._reader = reader @staticmethod def register_subcommand(parser: ArgumentParser): run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") run_parser.add_argument( "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" ) run_parser.add_argument( "--column", type=str, help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", ) run_parser.add_argument( "--format", type=str, default="infer", choices=PipelineDataFormat.SUPPORTED_FORMATS, help="Input format to read from", ) run_parser.add_argument( "--device", type=int, default=-1, help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", ) run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") run_parser.set_defaults(func=run_command_factory) def run(self): nlp, outputs = self._nlp, [] for entry in self._reader: output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) if isinstance(output, dict): outputs.append(output) else: outputs += output # Saving data if self._nlp.binary_output: binary_path = self._reader.save_binary(outputs) logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) else: self._reader.save(outputs) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/serving.py ================================================ import logging from argparse import ArgumentParser, Namespace from typing import Any, List, Optional from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, pipeline try: from uvicorn import run from fastapi import FastAPI, HTTPException, Body from fastapi.routing import APIRoute from pydantic import BaseModel from starlette.responses import JSONResponse _serve_dependencies_installed = True except (ImportError, AttributeError): BaseModel = object def Body(*x, **y): pass _serve_dependencies_installed = False logger = logging.getLogger("transformers1-cli/serving") def serve_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ nlp = pipeline( task=args.task, model=args.model if args.model else None, config=args.config, tokenizer=args.tokenizer, device=args.device, ) return ServeCommand(nlp, args.host, args.port, args.workers) class ServeModelInfoResult(BaseModel): """ Expose model information """ infos: dict class ServeTokenizeResult(BaseModel): """ Tokenize result model """ tokens: List[str] tokens_ids: Optional[List[int]] class ServeDeTokenizeResult(BaseModel): """ DeTokenize result model """ text: str class ServeForwardResult(BaseModel): """ Forward result model """ output: Any class ServeCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ serve_parser = parser.add_parser( "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." ) serve_parser.add_argument( "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" ) serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers") serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") serve_parser.add_argument( "--device", type=int, default=-1, help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", ) serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): self._pipeline = pipeline self.host = host self.port = port self.workers = workers if not _serve_dependencies_installed: raise RuntimeError( "Using serve command requires FastAPI and unicorn. " 'Please install transformers1 with [serving]: pip install "transformers1[serving]".' "Or install FastAPI and unicorn separately." ) else: logger.info("Serving model over {}:{}".format(host, port)) self._app = FastAPI( routes=[ APIRoute( "/", self.model_info, response_model=ServeModelInfoResult, response_class=JSONResponse, methods=["GET"], ), APIRoute( "/tokenize", self.tokenize, response_model=ServeTokenizeResult, response_class=JSONResponse, methods=["POST"], ), APIRoute( "/detokenize", self.detokenize, response_model=ServeDeTokenizeResult, response_class=JSONResponse, methods=["POST"], ), APIRoute( "/forward", self.forward, response_model=ServeForwardResult, response_class=JSONResponse, methods=["POST"], ), ], timeout=600, ) def run(self): run(self._app, host=self.host, port=self.port, workers=self.workers) def model_info(self): return ServeModelInfoResult(infos=vars(self._pipeline.model.config)) def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): """ Tokenize the provided input and eventually returns corresponding tokens id: - **text_input**: String to tokenize - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping. """ try: tokens_txt = self._pipeline.tokenizer.tokenize(text_input) if return_ids: tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids) else: return ServeTokenizeResult(tokens=tokens_txt) except Exception as e: raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) def detokenize( self, tokens_ids: List[int] = Body(None, embed=True), skip_special_tokens: bool = Body(False, embed=True), cleanup_tokenization_spaces: bool = Body(True, embed=True), ): """ Detokenize the provided tokens ids to readable text: - **tokens_ids**: List of tokens ids - **skip_special_tokens**: Flag indicating to not try to decode special tokens - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. """ try: decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) return ServeDeTokenizeResult(model="", text=decoded_str) except Exception as e: raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) async def forward(self, inputs=Body(None, embed=True)): """ **inputs**: **attention_mask**: **tokens_type_ids**: """ # Check we don't have empty string if len(inputs) == 0: return ServeForwardResult(output=[], attention=[]) try: # Forward through the model output = self._pipeline(inputs) return ServeForwardResult(output=output) except Exception as e: raise HTTPException(500, {"error": str(e)}) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/train.py ================================================ import os from argparse import ArgumentParser, Namespace from logging import getLogger from transformers import SingleSentenceClassificationProcessor as Processor from transformers import TextClassificationPipeline, is_tf_available, is_torch_available from transformers.commands import BaseTransformersCLICommand if not is_tf_available() and not is_torch_available(): raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") # TF training parameters USE_XLA = False USE_AMP = False def train_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ return TrainCommand(args) class TrainCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ Register this command to argparse so it's available for the transformer-cli :param parser: Root parser to register command-specific arguments :return: """ train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") train_parser.add_argument( "--train_data", type=str, required=True, help="path to train (and optionally evaluation) dataset as a csv with " "tab separated labels and sentences.", ) train_parser.add_argument( "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." ) train_parser.add_argument( "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." ) train_parser.add_argument( "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." ) train_parser.add_argument( "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." ) train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") train_parser.add_argument( "--validation_split", type=float, default=0.1, help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", ) train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") train_parser.add_argument( "--task", type=str, default="text_classification", help="Task to train the model on." ) train_parser.add_argument( "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." ) train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") train_parser.set_defaults(func=train_command_factory) def __init__(self, args: Namespace): self.logger = getLogger("transformers1-cli/training") self.framework = "tf" if is_tf_available() else "torch" os.makedirs(args.output, exist_ok=True) assert os.path.isdir(args.output) self.output = args.output self.column_label = args.column_label self.column_text = args.column_text self.column_id = args.column_id self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) if args.task == "text_classification": self.pipeline = TextClassificationPipeline.from_pretrained(args.model) elif args.task == "token_classification": raise NotImplementedError elif args.task == "question_answering": raise NotImplementedError self.logger.info("Loading dataset from {}".format(args.train_data)) self.train_dataset = Processor.create_from_csv( args.train_data, column_label=args.column_label, column_text=args.column_text, column_id=args.column_id, skip_first_row=args.skip_first_row, ) self.valid_dataset = None if args.validation_data: self.logger.info("Loading validation dataset from {}".format(args.validation_data)) self.valid_dataset = Processor.create_from_csv( args.validation_data, column_label=args.column_label, column_text=args.column_text, column_id=args.column_id, skip_first_row=args.skip_first_row, ) self.validation_split = args.validation_split self.train_batch_size = args.train_batch_size self.valid_batch_size = args.valid_batch_size self.learning_rate = args.learning_rate self.adam_epsilon = args.adam_epsilon def run(self): if self.framework == "tf": return self.run_tf() return self.run_torch() def run_torch(self): raise NotImplementedError def run_tf(self): self.pipeline.fit( self.train_dataset, validation_data=self.valid_dataset, validation_split=self.validation_split, learning_rate=self.learning_rate, adam_epsilon=self.adam_epsilon, train_batch_size=self.train_batch_size, valid_batch_size=self.valid_batch_size, ) # Save trained pipeline self.pipeline.save_pretrained(self.output) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/transformers_cli.py ================================================ #!/usr/bin/env python from argparse import ArgumentParser from transformers.commands.convert import ConvertCommand from transformers.commands.download import DownloadCommand from transformers.commands.env import EnvironmentCommand from transformers.commands.run import RunCommand from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands def main(): parser = ArgumentParser("Transformers CLI tool", usage="transformers1-cli []") commands_parser = parser.add_subparsers(help="transformers1-cli command helpers") # Register commands ConvertCommand.register_subcommand(commands_parser) DownloadCommand.register_subcommand(commands_parser) EnvironmentCommand.register_subcommand(commands_parser) RunCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) # Let's go args = parser.parse_args() if not hasattr(args, "func"): parser.print_help() exit(1) # Run service = args.func(args) service.run() if __name__ == "__main__": main() ================================================ FILE: code/bert-base-count5/pretrain/transformers1/commands/user.py ================================================ import os import sys from argparse import ArgumentParser from getpass import getpass from typing import List, Union from requests.exceptions import HTTPError from transformers.commands import BaseTransformersCLICommand from transformers.hf_api import HfApi, HfFolder UPLOAD_MAX_FILES = 15 class UserCommands(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): login_parser = parser.add_parser("login", help="Log in using the same credentials as on huggingface.co") login_parser.set_defaults(func=lambda args: LoginCommand(args)) whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.") whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args)) logout_parser = parser.add_parser("logout", help="Log out") logout_parser.set_defaults(func=lambda args: LogoutCommand(args)) # s3 s3_parser = parser.add_parser("s3", help="{ls, rm} Commands to interact with the files you upload on S3.") s3_subparsers = s3_parser.add_subparsers(help="s3 related commands") ls_parser = s3_subparsers.add_parser("ls") ls_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") ls_parser.set_defaults(func=lambda args: ListObjsCommand(args)) rm_parser = s3_subparsers.add_parser("rm") rm_parser.add_argument("filename", type=str, help="individual object filename to delete from S3.") rm_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") rm_parser.set_defaults(func=lambda args: DeleteObjCommand(args)) # upload upload_parser = parser.add_parser("upload", help="Upload a model to S3.") upload_parser.add_argument( "path", type=str, help="Local path of the model folder or individual file to upload." ) upload_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") upload_parser.add_argument( "--filename", type=str, default=None, help="Optional: override individual object filename on S3." ) upload_parser.set_defaults(func=lambda args: UploadCommand(args)) class ANSI: """ Helper for en.wikipedia.org/wiki/ANSI_escape_code """ _bold = "\u001b[1m" _red = "\u001b[31m" _reset = "\u001b[0m" @classmethod def bold(cls, s): return "{}{}{}".format(cls._bold, s, cls._reset) @classmethod def red(cls, s): return "{}{}{}".format(cls._bold + cls._red, s, cls._reset) class BaseUserCommand: def __init__(self, args): self.args = args self._api = HfApi() class LoginCommand(BaseUserCommand): def run(self): print( """ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| """ ) username = input("Username: ") password = getpass() try: token = self._api.login(username, password) except HTTPError as e: # probably invalid credentials, display error message. print(e) print(ANSI.red(e.response.text)) exit(1) HfFolder.save_token(token) print("Login successful") print("Your token:", token, "\n") print("Your token has been saved to", HfFolder.path_token) class WhoamiCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit() try: user, orgs = self._api.whoami(token) print(user) if orgs: print(ANSI.bold("orgs: "), ",".join(orgs)) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) class LogoutCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit() HfFolder.delete_token() self._api.logout(token) print("Successfully logged out.") class ListObjsCommand(BaseUserCommand): def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str: """ Inspired by: stackoverflow.com/a/8356620/593036 stackoverflow.com/questions/9535954/printing-lists-as-tabular-data """ col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)] row_format = ("{{:{}}} " * len(headers)).format(*col_widths) lines = [] lines.append(row_format.format(*headers)) lines.append(row_format.format(*["-" * w for w in col_widths])) for row in rows: lines.append(row_format.format(*row)) return "\n".join(lines) def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) try: objs = self._api.list_objs(token, organization=self.args.organization) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) if len(objs) == 0: print("No shared file yet") exit() rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs] print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])) class DeleteObjCommand(BaseUserCommand): def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) try: self._api.delete_obj(token, filename=self.args.filename, organization=self.args.organization) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) print("Done") class UploadCommand(BaseUserCommand): def walk_dir(self, rel_path): """ Recursively list all files in a folder. """ entries: List[os.DirEntry] = list(os.scandir(rel_path)) files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()] # (filepath, filename) for f in entries: if f.is_dir(): files += self.walk_dir(f.path) return files def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) local_path = os.path.abspath(self.args.path) if os.path.isdir(local_path): if self.args.filename is not None: raise ValueError("Cannot specify a filename override when uploading a folder.") rel_path = os.path.basename(local_path) files = self.walk_dir(rel_path) elif os.path.isfile(local_path): filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path) files = [(local_path, filename)] else: raise ValueError("Not a valid file or directory: {}".format(local_path)) if sys.platform == "win32": files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files] if len(files) > UPLOAD_MAX_FILES: print( "About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format( ANSI.bold(len(files)) ) ) exit(1) user, _ = self._api.whoami(token) namespace = self.args.organization if self.args.organization is not None else user for filepath, filename in files: print( "About to upload file {} to S3 under filename {} and namespace {}".format( ANSI.bold(filepath), ANSI.bold(filename), ANSI.bold(namespace) ) ) choice = input("Proceed? [Y/n] ").lower() if not (choice == "" or choice == "y" or choice == "yes"): print("Abort") exit() print(ANSI.bold("Uploading... This might take a while if files are large")) for filepath, filename in files: try: access_url = self._api.presign_and_upload( token=token, filename=filename, filepath=filepath, organization=self.args.organization ) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) exit(1) print("Your file now lives at:") print(access_url) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_albert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ ALBERT model configuration """ from .configuration_utils import PretrainedConfig ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-config.json", "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-config.json", "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-config.json", "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-config.json", "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", } class AlbertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers1 import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config """ model_type = "albert" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Config class. """ import logging from collections import OrderedDict from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig from .configuration_marian import MarianConfig from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig from .configuration_reformer import ReformerConfig from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig from .configuration_utils import PretrainedConfig from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig logger = logging.getLogger(__name__) ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( (key, value) for pretrained_map in [ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BART_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ] for key, value, in pretrained_map.items() ) CONFIG_MAPPING = OrderedDict( [ ("t5", T5Config,), ("distilbert", DistilBertConfig,), ("albert", AlbertConfig,), ("camembert", CamembertConfig,), ("xlm-roberta", XLMRobertaConfig,), ("marian", MarianConfig,), ("bart", BartConfig,), ("reformer", ReformerConfig,), ("longformer", LongformerConfig,), ("roberta", RobertaConfig,), ("flaubert", FlaubertConfig,), ("bert", BertConfig,), ("openai-gpt", OpenAIGPTConfig,), ("gpt2", GPT2Config,), ("transfo-xl", TransfoXLConfig,), ("xlnet", XLNetConfig,), ("xlm", XLMConfig,), ("ctrl", CTRLConfig,), ("electra", ElectraConfig,), ("encoder-decoder", EncoderDecoderConfig,), ] ) class AutoConfig: r""" :class:`~transformers1.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library when created with the :func:`~transformers1.AutoConfig.from_pretrained` class method. The :func:`~transformers1.AutoConfig.from_pretrained` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string. """ def __init__(self): raise EnvironmentError( "AutoConfig is designed to be instantiated " "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod def for_model(cls, model_type: str, *args, **kwargs): if model_type in CONFIG_MAPPING: config_class = CONFIG_MAPPING[model_type] return config_class(*args, **kwargs) raise ValueError( "Unrecognized model identifier: {}. Should contain one of {}".format( model_type, ", ".join(CONFIG_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiates one of the configuration classes of the library from a pre-trained model configuration. The configuration class to instantiate is selected based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5Config` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertConfig` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertConfig` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertConfig` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaConfig` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerConfig` (Longformer model) - `roberta`: :class:`~transformers1.RobertaConfig` (RoBERTa model) - `reformer`: :class:`~transformers1.ReformerConfig` (Reformer model) - `bert`: :class:`~transformers1.BertConfig` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTConfig` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2Config` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLConfig` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetConfig` (XLNet model) - `xlm`: :class:`~transformers1.XLMConfig` (XLM model) - `ctrl` : :class:`~transformers1.CTRLConfig` (CTRL model) - `flaubert` : :class:`~transformers1.FlaubertConfig` (Flaubert model) - `electra` : :class:`~transformers1.ElectraConfig` (ELECTRA model) Args: pretrained_model_name_or_path (:obj:`string`): Is either: \ - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers1.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir (:obj:`string`, optional, defaults to `None`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download (:obj:`boolean`, optional, defaults to `False`): Force to (re-)download the model weights and configuration files and override the cached versions if they exist. resume_download (:obj:`boolean`, optional, defaults to `False`): Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies (:obj:`Dict[str, str]`, optional, defaults to `None`): A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. See `the requests documentation `__ for usage. return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`): - If False, then this function returns just the final configuration object. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading. - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. Examples:: config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} """ config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) if "model_type" in config_dict: config_class = CONFIG_MAPPING[config_dict["model_type"]] return config_class.from_dict(config_dict, **kwargs) else: # Fallback: use pattern matching on the string. for pattern, config_class in CONFIG_MAPPING.items(): if pattern in pretrained_model_name_or_path: return config_class.from_dict(config_dict, **kwargs) raise ValueError( "Unrecognized model in {}. " "Should have a `model_type` key in its config.json, or contain one of the following strings " "in its name: {}".format(pretrained_model_name_or_path, ", ".join(CONFIG_MAPPING.keys())) ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ BART configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { "facebook/bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", "facebook/bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", "facebook/bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", "facebook/bart-large-xsum": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-xsum/config.json", "facebook/mbart-large-en-ro": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/config.json", } class BartConfig(PretrainedConfig): r""" Configuration class for Bart. Parameters are renamed from the fairseq implementation """ model_type = "bart" def __init__( self, activation_dropout=0.0, activation_function="gelu", vocab_size=50265, d_model=1024, encoder_ffn_dim=4096, encoder_layers=12, encoder_attention_heads=16, decoder_ffn_dim=4096, decoder_layers=12, decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, attention_dropout=0.0, dropout=0.1, max_position_embeddings=1024, init_std=0.02, classifier_dropout=0.0, num_labels=3, is_encoder_decoder=True, pad_token_id=1, bos_token_id=0, eos_token_id=2, normalize_before=False, add_final_layer_norm=False, scale_embedding=False, normalize_embedding=True, static_position_embeddings=False, add_bias_logits=False, **common_kwargs ): r""" :class:`~transformers1.BartConfig` is the configuration class for `BartModel`. Examples: config = BartConfig.from_pretrained('bart-large') model = BartModel(config) """ if "hidden_size" in common_kwargs: raise ValueError("hidden size is called d_model") super().__init__( num_labels=num_labels, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **common_kwargs, ) self.vocab_size = vocab_size self.d_model = d_model # encoder_embed_dim and decoder_embed_dim self.encoder_ffn_dim = encoder_ffn_dim self.encoder_layers = self.num_hidden_layers = encoder_layers self.encoder_attention_heads = encoder_attention_heads self.encoder_layerdrop = encoder_layerdrop self.decoder_layerdrop = decoder_layerdrop self.decoder_ffn_dim = decoder_ffn_dim self.decoder_layers = decoder_layers self.decoder_attention_heads = decoder_attention_heads self.max_position_embeddings = max_position_embeddings self.init_std = init_std # Normal(0, this parameter) self.activation_function = activation_function # Params introduced for Mbart self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.normalize_embedding = normalize_embedding # True for mbart, False otherwise self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before self.add_final_layer_norm = add_final_layer_norm # Params introduced for Marian self.add_bias_logits = add_bias_logits self.static_position_embeddings = static_position_embeddings # 3 Types of Dropout self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.dropout = dropout # Classifier stuff self.classif_dropout = classifier_dropout @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @property def hidden_size(self) -> int: return self.d_model def is_valid_mbart(self) -> bool: """Is the configuration aligned with the MBART paper.""" if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: return True if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: logger.info("This configuration is a mixture of MBART and BART settings") return False ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ BERT model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/config.json", "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/config.json", "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/config.json", "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/config.json", "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", # See all BERT models at https://huggingface.co/models?filter=bert } class BertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.BertModel`. It is used to instantiate an BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the BERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.BertModel`. hidden_size (:obj:`int`, optional, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 3072): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.BertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. Example:: from transformers1 import BertModel, BertConfig # Initializing a BERT bert-base-uncased style configuration configuration = BertConfig() # Initializing a model from the bert-base-uncased style configuration model = BertModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "bert" def __init__( self, vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, **kwargs ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_camembert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ CamemBERT configuration """ import logging from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", } class CamembertConfig(RobertaConfig): """ This class overrides :class:`~transformers1.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples. """ model_type = "camembert" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Salesforce CTRL configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-config.json"} class CTRLConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.CTRLModel`. It is used to instantiate an CTRL model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `ctrl `__ architecture from SalesForce. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 246534): Vocabulary size of the CTRL model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.CTRLModel`. n_positions (:obj:`int`, optional, defaults to 256): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 256): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 1280): Dimensionality of the embeddings and hidden states. dff (:obj:`int`, optional, defaults to 8192): Dimensionality of the inner dimension of the FFN. n_layer (:obj:`int`, optional, defaults to 48): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Example:: from transformers1 import CTRLModel, CTRLConfig # Initializing a CTRL configuration configuration = CTRLConfig() # Initializing a model from the configuration model = CTRLModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "ctrl" def __init__( self, vocab_size=246534, n_positions=256, n_ctx=256, n_embd=1280, dff=8192, n_layer=48, n_head=16, resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-6, initializer_range=0.02, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, **kwargs ): super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.dff = dff self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ DistilBERT model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", } class DistilBertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.DistilBertModel`. It is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT `distilbert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the DistilBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.BertModel`. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings. n_layers (:obj:`int`, optional, defaults to 6): Number of hidden layers in the Transformer encoder. n_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. dim (:obj:`int`, optional, defaults to 768): Dimensionality of the encoder layers and the pooler layer. hidden_dim (:obj:`int`, optional, defaults to 3072): The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. qa_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilities used in the question answering model :class:`~transformers1.DistilBertForQuestionAnswering`. seq_classif_dropout (:obj:`float`, optional, defaults to 0.2): The dropout probabilities used in the sequence classification model :class:`~transformers1.DistilBertForSequenceClassification`. Example:: from transformers1 import DistilBertModel, DistilBertConfig # Initializing a DistilBERT configuration configuration = DistilBertConfig() # Initializing a model from the configuration model = DistilBertModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "distilbert" def __init__( self, vocab_size=30522, max_position_embeddings=512, sinusoidal_pos_embds=False, n_layers=6, n_heads=12, dim=768, hidden_dim=4 * 768, dropout=0.1, attention_dropout=0.1, activation="gelu", initializer_range=0.02, qa_dropout=0.1, seq_classif_dropout=0.2, pad_token_id=0, **kwargs ): super().__init__(**kwargs, pad_token_id=pad_token_id) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.sinusoidal_pos_embds = sinusoidal_pos_embds self.n_layers = n_layers self.n_heads = n_heads self.dim = dim self.hidden_dim = hidden_dim self.dropout = dropout self.attention_dropout = attention_dropout self.activation = activation self.initializer_range = initializer_range self.qa_dropout = qa_dropout self.seq_classif_dropout = seq_classif_dropout @property def hidden_size(self): return self.dim @property def num_attention_heads(self): return self.n_heads @property def num_hidden_layers(self): return self.n_layers ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_electra.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ ELECTRA model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json", "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json", "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json", "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json", "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json", "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json", } class ElectraConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.ElectraModel`. It is used to instantiate an ELECTRA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA `google/electra-small-discriminator `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30522): Vocabulary size of the ELECTRA model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.ElectraModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of the encoder layers and the pooler layer. hidden_size (:obj:`int`, optional, defaults to 256): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (:obj:`int`, optional, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 1024): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers1.ElectraModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. Example:: from transformers1 import ElectraModel, ElectraConfig # Initializing a ELECTRA electra-base-uncased style configuration configuration = ElectraConfig() # Initializing a model from the electra-base-uncased style configuration model = ElectraModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "electra" def __init__( self, vocab_size=30522, embedding_size=128, hidden_size=256, num_hidden_layers=12, num_attention_heads=4, intermediate_size=1024, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, **kwargs ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) class EncoderDecoderConfig(PretrainedConfig): r""" :class:`~transformers1.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. See the documentation for :class:`~transformers1.PretrainedConfig` for more information. Args: kwargs (`optional`): Remaining dictionary of keyword arguments. Notably: encoder (:class:`PretrainedConfig`, optional, defaults to `None`): An instance of a configuration object that defines the encoder config. encoder (:class:`PretrainedConfig`, optional, defaults to `None`): An instance of a configuration object that defines the decoder config. Example:: from transformers1 import BertConfig, EncoderDecoderConfig, EncoderDecoderModel # Initializing a BERT bert-base-uncased style configuration config_encoder = BertConfig() config_decoder = BertConfig() config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) # Initializing a Bert2Bert model from the bert-base-uncased style configurations model = EncoderDecoderModel(config=config) # Accessing the model configuration config_encoder = model.config.encoder config_decoder = model.config.decoder """ model_type = "encoder_decoder" def __init__(self, **kwargs): super().__init__(**kwargs) assert ( "encoder" in kwargs and "decoder" in kwargs ), "Config has to be initialized with encoder and decoder config" encoder_config = kwargs.pop("encoder") encoder_model_type = encoder_config.pop("model_type") decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") from transformers import AutoConfig self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True @classmethod def from_encoder_decoder_configs( cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig ) -> PretrainedConfig: r""" Instantiate a :class:`~transformers1.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model configuration and decoder model configuration. Returns: :class:`EncoderDecoderConfig`: An instance of a configuration object """ return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict()) def to_dict(self): """ Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) output["encoder"] = self.encoder.to_dict() output["decoder"] = self.decoder.to_dict() output["model_type"] = self.__class__.model_type return output ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Flaubert configuration, based on XLM. """ import logging from .configuration_xlm import XLMConfig logger = logging.getLogger(__name__) FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/config.json", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/config.json", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/config.json", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/config.json", } class FlaubertConfig(XLMConfig): """ Configuration class to store the configuration of a `FlaubertModel`. This is the configuration class to store the configuration of a :class:`~transformers1.XLMModel`. It is used to instantiate an XLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlm-mlm-en-2048 `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to apply the layer normalization before or after the feed forward layer following the attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018) layerdrop (:obj:`float`, `optional`, defaults to 0.0): Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with Structured Dropout. ICLR 2020) vocab_size (:obj:`int`, optional, defaults to 30145): Vocabulary size of the Flaubert model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.FlaubertModel`. emb_dim (:obj:`int`, optional, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for the attention mechanism gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): The non-linear activation function (function or string) in the encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. causal (:obj:`boolean`, optional, defaults to :obj:`False`): Set this to `True` for the model to behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead if a bidirectional context. asm (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer. n_langs (:obj:`int`, optional, defaults to 1): The number of languages the model handles. Set to 1 for monolingual models. use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`) Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual models page `__ for information on how to use them. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. init_std (:obj:`int`, optional, defaults to 50257): The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. bos_index (:obj:`int`, optional, defaults to 0): The index of the beginning of sentence token in the vocabulary. eos_index (:obj:`int`, optional, defaults to 1): The index of the end of sentence token in the vocabulary. pad_index (:obj:`int`, optional, defaults to 2): The index of the padding token in the vocabulary. unk_index (:obj:`int`, optional, defaults to 3): The index of the unknown token in the vocabulary. mask_index (:obj:`int`, optional, defaults to 5): The index of the masking token in the vocabulary. is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`): Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. summary_type (:obj:`string`, optional, defaults to "first"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a dropout before the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. mask_token_id (:obj:`int`, optional, defaults to 0): Model agnostic parameter to identify masked tokens when generating text in an MLM context. lang_id (:obj:`int`, optional, defaults to 1): The ID of the language used by the model. This parameter is used when generating text in a given language. """ model_type = "flaubert" def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): """Constructs FlaubertConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.layerdrop = layerdrop self.pre_norm = pre_norm ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ OpenAI GPT-2 configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json", } class GPT2Config(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.GPT2Model`. It is used to instantiate an GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT-2 `small `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 50257): Vocabulary size of the GPT-2 model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.GPT2Model`. n_positions (:obj:`int`, optional, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 1024): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 768): Dimensionality of the embeddings and hidden states. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. activation_function (:obj:`str`, optional, defaults to 'gelu'): Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"]. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 16): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. summary_type (:obj:`string`, optional, defaults to "cls_index"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.GPT2DoubleHeadsModel`. Add a dropout before the projection and activation Example:: from transformers1 import GPT2Model, GPT2Config # Initializing a GPT2 configuration configuration = GPT2Config() # Initializing a model from the configuration model = GPT2Model(configuration) # Accessing the model configuration configuration = model.config """ model_type = "gpt2" def __init__( self, vocab_size=50257, n_positions=1024, n_ctx=1024, n_embd=768, n_layer=12, n_head=12, activation_function="gelu_new", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, bos_token_id=50256, eos_token_id=50256, **kwargs ): super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.activation_function = activation_function self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Longformer configuration """ import logging from typing import List, Union from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { "allenai/longformer-base-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096/config.json", "allenai/longformer-large-4096": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096/config.json", "allenai/longformer-large-4096-finetuned-triviaqa": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-finetuned-triviaqa/config.json", "allenai/longformer-base-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-base-4096-extra.pos.embd.only/config.json", "allenai/longformer-large-4096-extra.pos.embd.only": "https://s3.amazonaws.com/models.huggingface.co/bert/allenai/longformer-large-4096-extra.pos.embd.only/config.json", } class LongformerConfig(RobertaConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.LongformerModel`. It is used to instantiate an Longformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa `roberta-base `__ architecture with a sequence length 4,096. The :class:`~transformers1.LongformerConfig` class directly inherits :class:`~transformers1.RobertaConfig`. It reuses the same defaults. Please check the parent class for more information. Args: attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512): Size of an attention window around each token. If :obj:`int`, use the same size for all layers. To specify a different window size for each layer, use a :obj:`List[int]` where ``len(attention_window) == num_hidden_layers``. Example:: from transformers1 import LongformerConfig, LongformerModel # Initializing a Longformer configuration configuration = LongformerConfig() # Initializing a model from the configuration model = LongformerModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "longformer" def __init__(self, attention_window: Union[List[int], int] = 512, sep_token_id: int = 2, **kwargs): super().__init__(**kwargs) self.attention_window = attention_window self.sep_token_id = sep_token_id ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_marian.py ================================================ # coding=utf-8 # Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Marian model configuration """ from .configuration_bart import BartConfig PRETRAINED_CONFIG_ARCHIVE_MAP = { "Helsinki-NLP/opus-mt-en-de": "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/config.json", } class MarianConfig(BartConfig): model_type = "marian" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_mmbt.py ================================================ # coding=utf-8 # Copyright (c) Facebook, Inc. and its affiliates. # Copyright (c) HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ MMBT configuration """ import logging logger = logging.getLogger(__name__) class MMBTConfig(object): """Configuration class to store the configuration of a `MMBT Model`. Args: config (:obj:`~transformers1.PreTrainedConfig`): Config of the underlying Transformer models. Its values are copied over to use a single config. num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): Size of final Linear layer for classification. modal_hidden_size (:obj:`int`, optional, defautls to 2048): Embedding dimension of the non-text modality encoder. """ def __init__(self, config, num_labels=None, modal_hidden_size=2048): self.__dict__ = config.__dict__ self.modal_hidden_size = modal_hidden_size if num_labels: self.num_labels = num_labels ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ OpenAI GPT configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" } class OpenAIGPTConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.OpenAIGPTModel`. It is used to instantiate an GPT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `GPT `__ architecture from OpenAI. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 40478): Vocabulary size of the GPT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.CTRLModel`. n_positions (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). n_ctx (:obj:`int`, optional, defaults to 512): Dimensionality of the causal mask (usually same as n_positions). n_embd (:obj:`int`, optional, defaults to 768): Dimensionality of the embeddings and hidden states. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. resid_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, optional, defaults to 0.1): The dropout ratio for the embeddings. attn_pdrop (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention. layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`): Whether special tokens should be predicted when the model is has a language modeling head. summary_type (:obj:`string`, optional, defaults to "cls_index"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.OpenAIGPTDoubleHeadsModel`. Add a dropout before the projection and activation Example:: from transformers1 import OpenAIGPTConfig, OpenAIGPTModel # Initializing a GPT configuration configuration = OpenAIGPTConfig() # Initializing a model from the configuration model = OpenAIGPTModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "openai-gpt" def __init__( self, vocab_size=40478, n_positions=512, n_ctx=512, n_embd=768, n_layer=12, n_head=12, afn="gelu", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True, summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, **kwargs ): super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head self.afn = afn self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range self.predict_special_tokens = predict_special_tokens self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.n_embd @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Reformer model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/config.json", "google/reformer-enwik8": "https://cdn.huggingface.co/google/reformer-enwik8/config.json", } class ReformerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.ReformerModel`. It is used to instantiate an Reformer model according to the specified arguments, defining the model architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: attention_head_size (:obj:`int`, optional, defaults to 64): Dimensionality of the projected key, query and value vectors attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]): List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local"). For more information on LSHSelfAttention layer, see `LSH Self Attention `__ . For more information on LocalSelfAttention layer, see `Local Self Attention `__ . axial_pos_embds (:obj:`bool`, optional, defaults to True): If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings `__ axial_norm_std (:obj:`float`, optional, defaluts to 1.0): The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings. axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`): The position dims of the axial position encodings. During training the product of the position dims has to equal the sequence length. For more information on how axial position embeddings work, see `Axial Position Encodings `__. axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`): The embedding dims of the axial position encodings. The sum of the embedding dims has to equal the hidden size. For more information on how axial position embeddings work, see `Axial Position Encodings `__. chunk_size_lm_head (:obj:`int`, optional, defaults to 0): The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time. For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . chunk_size_feed_forward (:obj:`int`, optional, defaults to 0): The chunk size of all feed forward layers in the residual attention blocks. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time. For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . eos_token_id (:obj:`int`, optional, defaults to 2): The token id for the token. feed_forward_size (:obj:`int`, optional, defaults to 512): Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block. hash_seed (:obj:`int`, optional, defaults to `None`): Seed that can be used to make local sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposed. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in local sensitive hashing scheme. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"): The non-linear activation function (function or string) in the feed forward layer in the residual attention block. If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. hidden_size (:obj:`int`, optional, defaults to 256): Dimensionality of the output hidden states of the residual attention blocks. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. is_decoder (:obj:`bool`, optional, defaults to False): If `is_decoder` is True, a causal mask is used in addition to `attention_mask`. When using the Reformer for causal language modeling, `is_decoder` is set to `True`. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. local_chunk_length (:obj:`int`, optional, defaults to 64): Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). local_num_chunks_before (:obj:`int`, optional, defaults to 1): Number of previous neighbouring chunks to attend to in LocalSelfAttention layer to itself. local_num_chunks_after (:obj:`int`, optional, defaults to 0): Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself. local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities in LocalSelfAttention. lsh_chunk_length (:obj:`int`, optional, defaults to 64): Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). lsh_num_chunks_before (:obj:`int`, optional, defaults to 1): Number of previous neighbouring chunks to attend to in LSHSelfAttention layer to itself. lsh_num_chunks_after (:obj:`int`, optional, defaults to 0): Number of following neighbouring chunks to attend to in LSHSelfAttention layer to itself. lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for the attention probabilities in LSHSelfAttention. max_position_embeddings (:obj:`int`, optional, defaults to 4096): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). num_attention_heads (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`): Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors. The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly. num_hashes (:obj:`int`, optional, defaults to 1): Number of hashing rounds (e.g. number of random rotations) in Local Sensitive Hashing scheme. The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes. pad_token_id (:obj:`int`, optional, defaults to 0): The token id for the token. vocab_size (:obj:`int`, optional, defaults to 320): Vocabulary size of the Reformer model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.ReformerModel`. Example:: from transformers1 import ReformerModel, ReformerConfig # Initializing a Reformer configuration configuration = ReformerConfig() # Initializing a Reformer model model = ReformerModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "reformer" def __init__( self, attention_head_size=64, attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"], axial_norm_std=1.0, axial_pos_embds=True, axial_pos_shape=[64, 64], axial_pos_embds_dim=[64, 192], chunk_size_lm_head=0, chunk_size_feed_forward=0, eos_token_id=2, feed_forward_size=512, hash_seed=None, hidden_act="relu", hidden_dropout_prob=0.05, hidden_size=256, initializer_range=0.02, is_decoder=False, layer_norm_eps=1e-12, local_num_chunks_before=1, local_num_chunks_after=0, local_attention_probs_dropout_prob=0.05, local_attn_chunk_length=64, lsh_attn_chunk_length=64, lsh_attention_probs_dropout_prob=0.0, lsh_num_chunks_before=1, lsh_num_chunks_after=0, max_position_embeddings=4096, num_attention_heads=2, num_buckets=None, num_hashes=1, pad_token_id=0, vocab_size=320, **kwargs ): super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_decoder=is_decoder, **kwargs) self.hash_seed = hash_seed self.vocab_size = vocab_size self.attention_head_size = attention_head_size self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads self.num_hashes = num_hashes self.num_hidden_layers = len(attn_layers) self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets self.lsh_attn_chunk_length = lsh_attn_chunk_length self.local_attn_chunk_length = local_attn_chunk_length self.lsh_num_chunks_after = lsh_num_chunks_after self.lsh_num_chunks_before = lsh_num_chunks_before self.local_num_chunks_after = local_num_chunks_after self.local_num_chunks_before = local_num_chunks_before self.hidden_act = hidden_act self.feed_forward_size = feed_forward_size self.hidden_dropout_prob = hidden_dropout_prob self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.axial_pos_embds = axial_pos_embds self.axial_pos_shape = tuple(axial_pos_shape) self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) self.axial_norm_std = axial_norm_std self.chunk_size_lm_head = chunk_size_lm_head self.chunk_size_feed_forward = chunk_size_feed_forward self.attn_layers = attn_layers ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ RoBERTa configuration """ import logging from .configuration_bert import BertConfig logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", } class RobertaConfig(BertConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers1.RobertaModel`. It is used to instantiate an RoBERTa model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. The :class:`~transformers1.RobertaConfig` class directly inherits :class:`~transformers1.BertConfig`. It reuses the same defaults. Please check the parent class for more information. Example:: from transformers1 import RobertaConfig, RobertaModel # Initializing a RoBERTa configuration configuration = RobertaConfig() # Initializing a model from the configuration model = RobertaModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "roberta" def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): """Constructs RobertaConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_t5.py ================================================ # coding=utf-8 # Copyright 2010, The T5 Authors and HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ T5 model configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", } class T5Config(PretrainedConfig): r""" :class:`~transformers1.T5Config` is the configuration class to store the configuration of a `T5Model`. Arguments: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. d_model: Size of the encoder layers and the pooler layer. `d_model` can also accesed via the property `hidden_size`. num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`. num_heads: Number of attention heads for each attention layer in the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`. intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. n_positions: The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings'. type_vocab_size: The vocabulary size of the `token_type_ids` passed into `T5Model`. initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). layer_norm_eps: The epsilon used by LayerNorm. """ model_type = "t5" def __init__( self, vocab_size=32128, n_positions=512, d_model=512, d_kv=64, d_ff=2048, num_layers=6, num_heads=8, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-6, initializer_factor=1.0, is_encoder_decoder=True, pad_token_id=0, eos_token_id=1, **kwargs ): super().__init__( pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs, ) self.vocab_size = vocab_size self.n_positions = n_positions self.d_model = d_model self.d_kv = d_kv self.d_ff = d_ff self.num_layers = num_layers self.num_heads = num_heads self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate self.layer_norm_epsilon = layer_norm_epsilon self.initializer_factor = initializer_factor @property def max_position_embeddings(self): return self.n_positions @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.num_heads @property def num_hidden_layers(self): return self.num_layers ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Transformer XL configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", } class TransfoXLConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.TransfoXLModel`. It is used to instantiate a Transformer XL model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `Transformer XL `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 267735): Vocabulary size of the Transformer XL model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.TransfoXLModel`. cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`): Cutoffs for the adaptive softmax d_model (:obj:`int`, optional, defaults to 1024): Dimensionality of the model's hidden states. d_embed (:obj:`int`, optional, defaults to 1024): Dimensionality of the embeddings n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. d_head (:obj:`int`, optional, defaults to 64): Dimensionality of the model's heads. d_inner (:obj:`int`, optional, defaults to 4096): Inner dimension in FF div_val (:obj:`int`, optional, defaults to 4): Divident value for adapative input and softmax pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`): Apply LayerNorm to the input instead of the output n_layer (:obj:`int`, optional, defaults to 18): Number of hidden layers in the Transformer encoder. tgt_len (:obj:`int`, optional, defaults to 128): Number of tokens to predict ext_len (:obj:`int`, optional, defaults to 0): Length of the extended context mem_len (:obj:`int`, optional, defaults to 1600): Length of the retained previous heads clamp_len (:obj:`int`, optional, defaults to 1000): use the same pos embeddings after clamp_len same_length (:obj:`boolean`, optional, defaults to :obj:`True`): Use the same attn length for all tokens proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`): True to share all but first projs, False not to share. attn_type (:obj:`int`, optional, defaults to 0): Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. sample_softmax (:obj:`int`, optional, defaults to -1): number of samples in sampled softmax adaptive (:obj:`boolean`, optional, defaults to :obj:`True`): use adaptive softmax tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`): tie the word embedding and softmax weights dropout (:obj:`float`, optional, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. dropatt (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): Untie relative position biases init (:obj:`string`, optional, defaults to `normal`): Parameter initializer to use init_range (:obj:`float`, optional, defaults to 0.01): Parameters initialized by U(-init_range, init_range). proj_init_std (:obj:`float`, optional, defaults to 0.01): Parameters initialized by N(0, init_std) init_std (:obj:`float`, optional, defaults to 0.02): Parameters initialized by N(0, init_std) layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5): The epsilon to use in the layer normalization layers Example:: from transformers1 import TransfoXLConfig, TransfoXLModel # Initializing a Transformer XL configuration configuration = TransfoXLConfig() # Initializing a model from the configuration model = TransfoXLModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "transfo-xl" def __init__( self, vocab_size=267735, cutoffs=[20000, 40000, 200000], d_model=1024, d_embed=1024, n_head=16, d_head=64, d_inner=4096, div_val=4, pre_lnorm=False, n_layer=18, tgt_len=128, ext_len=0, mem_len=1600, clamp_len=1000, same_length=True, proj_share_all_but_first=True, attn_type=0, sample_softmax=-1, adaptive=True, tie_weight=True, dropout=0.1, dropatt=0.0, untie_r=True, init="normal", init_range=0.01, proj_init_std=0.01, init_std=0.02, layer_norm_epsilon=1e-5, eos_token_id=0, **kwargs ): super().__init__(eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) self.tie_weight = tie_weight if proj_share_all_but_first: self.tie_projs = [False] + [True] * len(self.cutoffs) else: self.tie_projs = [False] + [False] * len(self.cutoffs) self.d_model = d_model self.d_embed = d_embed self.d_head = d_head self.d_inner = d_inner self.div_val = div_val self.pre_lnorm = pre_lnorm self.n_layer = n_layer self.n_head = n_head self.tgt_len = tgt_len self.ext_len = ext_len self.mem_len = mem_len self.same_length = same_length self.attn_type = attn_type self.clamp_len = clamp_len self.sample_softmax = sample_softmax self.adaptive = adaptive self.dropout = dropout self.dropatt = dropatt self.untie_r = untie_r self.init = init self.init_range = init_range self.proj_init_std = proj_init_std self.init_std = init_std self.layer_norm_epsilon = layer_norm_epsilon @property def max_position_embeddings(self): return self.tgt_len + self.ext_len + self.mem_len @property def n_token(self): # Backward compatibility return self.vocab_size @n_token.setter def n_token(self, value): # Backward compatibility self.vocab_size = value @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Configuration base class and utilities.""" import copy import json import logging import os from typing import Dict, Tuple from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url logger = logging.getLogger(__name__) class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. Note: A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. It only affects the model's configuration. Class attributes (overridden by derived classes): - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers1.AutoConfig`. Args: finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. num_labels (:obj:`int`, `optional`, defaults to `2`): Number of classes to use when the model is a classification model (sequences/tokens) output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): Should the model returns attentions weights. output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): Should the model returns all hidden-states. torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): Is the model used with Torchscript (for PyTorch models). """ model_type: str = "" def __init__(self, **kwargs): # Attributes with defaults self.output_attentions = kwargs.pop("output_attentions", False) self.output_hidden_states = kwargs.pop("output_hidden_states", False) self.use_cache = kwargs.pop("use_cache", True) # Not used by all models self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models self.use_bfloat16 = kwargs.pop("use_bfloat16", False) self.pruned_heads = kwargs.pop("pruned_heads", {}) # Is decoder is used in encoder-decoder models to differentiate encoder from decoder self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) self.is_decoder = kwargs.pop("is_decoder", False) # Parameters for sequence generation self.max_length = kwargs.pop("max_length", 20) self.min_length = kwargs.pop("min_length", 0) self.do_sample = kwargs.pop("do_sample", False) self.early_stopping = kwargs.pop("early_stopping", False) self.num_beams = kwargs.pop("num_beams", 1) self.temperature = kwargs.pop("temperature", 1.0) self.top_k = kwargs.pop("top_k", 50) self.top_p = kwargs.pop("top_p", 1.0) self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) self.bad_words_ids = kwargs.pop("bad_words_ids", None) self.num_return_sequences = kwargs.pop("num_return_sequences", 1) # Fine-tuning task arguments self.architectures = kwargs.pop("architectures", None) self.finetuning_task = kwargs.pop("finetuning_task", None) self.id2label = kwargs.pop("id2label", None) self.label2id = kwargs.pop("label2id", None) if self.id2label is not None: kwargs.pop("num_labels", None) self.id2label = dict((int(key), value) for key, value in self.id2label.items()) # Keys are always strings in JSON so convert ids to int here. else: self.num_labels = kwargs.pop("num_labels", 2) # Tokenizer arguments TODO: eventually tokenizer and models should share the same config self.prefix = kwargs.pop("prefix", None) self.bos_token_id = kwargs.pop("bos_token_id", None) self.pad_token_id = kwargs.pop("pad_token_id", None) self.eos_token_id = kwargs.pop("eos_token_id", None) self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) # task specific arguments self.task_specific_params = kwargs.pop("task_specific_params", None) # TPU arguments self.xla_device = kwargs.pop("xla_device", None) # Additional attributes without default values for key, value in kwargs.items(): try: setattr(self, key, value) except AttributeError as err: logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err @property def num_labels(self): return len(self.id2label) @num_labels.setter def num_labels(self, num_labels): self.id2label = {i: "LABEL_{}".format(i) for i in range(num_labels)} self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers1.PretrainedConfig.from_pretrained` class method. Args: save_directory (:obj:`string`): Directory where the configuration JSON file will be saved. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) self.to_json_file(output_config_file, use_diff=True) logger.info("Configuration saved in {}".format(output_config_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": r""" Instantiate a :class:`~transformers1.PretrainedConfig` (or a derived class) from a pre-trained model configuration. Args: pretrained_model_name_or_path (:obj:`string`): either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers1.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir (:obj:`string`, `optional`): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. kwargs (:obj:`Dict[str, any]`, `optional`): The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Force to (re-)download the model weights and configuration files and override the cached versions if they exist. resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies (:obj:`Dict`, `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. return_unused_kwargs: (`optional`) bool: If False, then this function returns just the final configuration object. If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. Returns: :class:`PretrainedConfig`: An instance of a configuration object Examples:: # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a # derived class: BertConfig config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} """ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(config_dict, **kwargs) @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[Dict, Dict]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a Config using `from_dict`. Parameters: pretrained_model_name_or_path (:obj:`string`): The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. Returns: :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) if os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path else: config_file = hf_bucket_url(pretrained_model_name_or_path, filename=CONFIG_NAME, use_cdn=False) try: # Load from URL or cache if already cached resolved_config_file = cached_path( config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) # Load config dict if resolved_config_file is None: raise EnvironmentError config_dict = cls._dict_from_json_file(resolved_config_file) except EnvironmentError: msg = ( f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" ) raise EnvironmentError(msg) except json.JSONDecodeError: msg = ( "Couldn't reach server at '{}' to download configuration file or " "configuration file is not a valid JSON file. " "Please check network or file content here: {}.".format(config_file, resolved_config_file) ) raise EnvironmentError(msg) if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) return config_dict, kwargs @classmethod def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": """ Constructs a `Config` from a Python dictionary of parameters. Args: config_dict (:obj:`Dict[str, any]`): Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved from a pre-trained checkpoint by leveraging the :func:`~transformers1.PretrainedConfig.get_config_dict` method. kwargs (:obj:`Dict[str, any]`): Additional parameters from which to initialize the configuration object. Returns: :class:`PretrainedConfig`: An instance of a configuration object """ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) config = cls(**config_dict) if hasattr(config, "pruned_heads"): config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(config, key): setattr(config, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model config %s", str(config)) if return_unused_kwargs: return config, kwargs else: return config @classmethod def from_json_file(cls, json_file: str) -> "PretrainedConfig": """ Constructs a `Config` from the path to a json file of parameters. Args: json_file (:obj:`string`): Path to the JSON file containing the parameters. Returns: :class:`PretrainedConfig`: An instance of a configuration object """ config_dict = cls._dict_from_json_file(json_file) return cls(**config_dict) @classmethod def _dict_from_json_file(cls, json_file: str): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) def __eq__(self, other): return self.__dict__ == other.__dict__ def __repr__(self): return "{} {}".format(self.__class__.__name__, self.to_json_string()) def to_diff_dict(self): """ Removes all attributes from config which correspond to the default config attributes for better readability and serializes to a Python dictionary. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ config_dict = self.to_dict() # get the default config dict default_config_dict = PretrainedConfig().to_dict() serializable_config_dict = {} # only serialize values that differ from the default config for key, value in config_dict.items(): if key not in default_config_dict or value != default_config_dict[key]: serializable_config_dict[key] = value return serializable_config_dict def to_dict(self): """ Serializes this instance to a Python dictionary. Returns: :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, """ output = copy.deepcopy(self.__dict__) if hasattr(self.__class__, "model_type"): output["model_type"] = self.__class__.model_type return output def to_json_string(self, use_diff=True): """ Serializes this instance to a JSON string. Args: use_diff (:obj:`bool`): If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON string. Returns: :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. """ if use_diff is True: config_dict = self.to_diff_dict() else: config_dict = self.to_dict() return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path, use_diff=True): """ Save this instance to a json file. Args: json_file_path (:obj:`string`): Path to the JSON file in which this configuration instance's parameters will be saved. use_diff (:obj:`bool`): If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON file. """ with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string(use_diff=use_diff)) def update(self, config_dict: Dict): """ Updates attributes of this class with attributes from `config_dict`. Args: :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. """ for key, value in config_dict.items(): setattr(self, key, value) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLM configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", } class XLMConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.XLMModel`. It is used to instantiate an XLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlm-mlm-en-2048 `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30145): Vocabulary size of the XLM model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.XLMModel`. emb_dim (:obj:`int`, optional, defaults to 2048): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for the attention mechanism gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): The non-linear activation function (function or string) in the encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. causal (:obj:`boolean`, optional, defaults to :obj:`False`): Set this to `True` for the model to behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead if a bidirectional context. asm (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer. n_langs (:obj:`int`, optional, defaults to 1): The number of languages the model handles. Set to 1 for monolingual models. use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`) Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual models page `__ for information on how to use them. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. init_std (:obj:`int`, optional, defaults to 50257): The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. bos_index (:obj:`int`, optional, defaults to 0): The index of the beginning of sentence token in the vocabulary. eos_index (:obj:`int`, optional, defaults to 1): The index of the end of sentence token in the vocabulary. pad_index (:obj:`int`, optional, defaults to 2): The index of the padding token in the vocabulary. unk_index (:obj:`int`, optional, defaults to 3): The index of the unknown token in the vocabulary. mask_index (:obj:`int`, optional, defaults to 5): The index of the masking token in the vocabulary. is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`): Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. summary_type (:obj:`string`, optional, defaults to "first"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_first_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLMForSequenceClassification`. Add a dropout before the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. mask_token_id (:obj:`int`, optional, defaults to 0): Model agnostic parameter to identify masked tokens when generating text in an MLM context. lang_id (:obj:`int`, optional, defaults to 1): The ID of the language used by the model. This parameter is used when generating text in a given language. Example:: from transformers1 import XLMConfig, XLMModel # Initializing a XLM configuration configuration = XLMConfig() # Initializing a model from the configuration model = XLMModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "xlm" def __init__( self, vocab_size=30145, emb_dim=2048, n_layers=12, n_heads=16, dropout=0.1, attention_dropout=0.1, gelu_activation=True, sinusoidal_embeddings=False, causal=False, asm=False, n_langs=1, use_lang_emb=True, max_position_embeddings=512, embed_init_std=2048 ** -0.5, layer_norm_eps=1e-12, init_std=0.02, bos_index=0, eos_index=1, pad_index=2, unk_index=3, mask_index=5, is_encoder=True, summary_type="first", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, start_n_top=5, end_n_top=5, mask_token_id=0, lang_id=0, pad_token_id=2, bos_token_id=0, **kwargs ): """Constructs XLMConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) self.vocab_size = vocab_size self.emb_dim = emb_dim self.n_layers = n_layers self.n_heads = n_heads self.dropout = dropout self.attention_dropout = attention_dropout self.gelu_activation = gelu_activation self.sinusoidal_embeddings = sinusoidal_embeddings self.causal = causal self.asm = asm self.n_langs = n_langs self.use_lang_emb = use_lang_emb self.layer_norm_eps = layer_norm_eps self.bos_index = bos_index self.eos_index = eos_index self.pad_index = pad_index self.unk_index = unk_index self.mask_index = mask_index self.is_encoder = is_encoder self.max_position_embeddings = max_position_embeddings self.embed_init_std = embed_init_std self.init_std = init_std self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_proj_to_labels = summary_proj_to_labels self.summary_first_dropout = summary_first_dropout self.start_n_top = start_n_top self.end_n_top = end_n_top self.mask_token_id = mask_token_id self.lang_id = lang_id if "n_words" in kwargs: self.n_words = kwargs["n_words"] @property def n_words(self): # For backward compatibility return self.vocab_size @n_words.setter def n_words(self, value): # For backward compatibility self.vocab_size = value @property def hidden_size(self): return self.emb_dim @property def num_attention_heads(self): return self.n_heads @property def num_hidden_layers(self): return self.n_layers ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLM-RoBERTa configuration """ import logging from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", } class XLMRobertaConfig(RobertaConfig): """ This class overrides :class:`~transformers1.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples. """ model_type = "xlm-roberta" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/configuration_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XLNet configuration """ import logging from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", } class XLNetConfig(PretrainedConfig): """ This is the configuration class to store the configuration of a :class:`~transformers1.XLNetModel`. It is used to instantiate an XLNet model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the `xlnet-large-cased `__ architecture. Configuration objects inherit from :class:`~transformers1.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers1.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 32000): Vocabulary size of the XLNet model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers1.XLNetModel`. d_model (:obj:`int`, optional, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. n_layer (:obj:`int`, optional, defaults to 24): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. d_inner (:obj:`int`, optional, defaults to 4096): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. ff_activation (:obj:`string`, optional, defaults to "gelu"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. untie_r (:obj:`boolean`, optional, defaults to :obj:`True`): Untie relative position biases attn_type (:obj:`string`, optional, defaults to "bi"): The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. dropout (:obj:`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous forward pass won't be re-computed. See the `quickstart `__ for more information. reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`): The number of tokens in the current batch to be cached and reused in the future. bi_data (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use bidirectional input pipeline. Usually set to `True` during pretraining and `False` during finetuning. clamp_len (:obj:`int`, optional, defaults to -1): Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping. same_length (:obj:`boolean`, optional, defaults to :obj:`False`): Whether to use the same attention length for each token. summary_type (:obj:`string`, optional, defaults to "last"): Argument used when doing sequence summary. Used in for the multiple choice head in :class:transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Is one of the following options: - 'last' => take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Add a projection after the vector extraction summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. 'tanh' => add a tanh activation to the output, Other => no activation. summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_last_dropout (:obj:`float`, optional, defaults to 0.1): Argument used when doing sequence summary. Used in for the multiple choice head in :class:`~transformers1.XLNetForSequenceClassification` and :class:`~transformers1.XLNetForMultipleChoice`. Add a dropout after the projection and activation start_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. end_n_top (:obj:`int`, optional, defaults to 5): Used in the SQuAD evaluation script for XLM and XLNet. Example:: from transformers1 import XLNetConfig, XLNetModel # Initializing a XLNet configuration configuration = XLNetConfig() # Initializing a model from the configuration model = XLNetModel(configuration) # Accessing the model configuration configuration = model.config """ model_type = "xlnet" def __init__( self, vocab_size=32000, d_model=1024, n_layer=24, n_head=16, d_inner=4096, ff_activation="gelu", untie_r=True, attn_type="bi", initializer_range=0.02, layer_norm_eps=1e-12, dropout=0.1, mem_len=None, reuse_len=None, bi_data=False, clamp_len=-1, same_length=False, summary_type="last", summary_use_proj=True, summary_activation="tanh", summary_last_dropout=0.1, start_n_top=5, end_n_top=5, pad_token_id=5, bos_token_id=1, eos_token_id=2, **kwargs ): """Constructs XLNetConfig. """ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.d_model = d_model self.n_layer = n_layer self.n_head = n_head assert d_model % n_head == 0 self.d_head = d_model // n_head self.ff_activation = ff_activation self.d_inner = d_inner self.untie_r = untie_r self.attn_type = attn_type self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.dropout = dropout self.mem_len = mem_len self.reuse_len = reuse_len self.bi_data = bi_data self.clamp_len = clamp_len self.same_length = same_length self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_last_dropout = summary_last_dropout self.start_n_top = start_n_top self.end_n_top = end_n_top self.bos_token_id = bos_token_id self.pad_token_id = pad_token_id self.eos_token_id = eos_token_id @property def max_position_embeddings(self): return -1 @property def n_token(self): # Backward compatibility return self.vocab_size @n_token.setter def n_token(self, value): # Backward compatibility self.vocab_size = value @property def hidden_size(self): return self.d_model @property def num_attention_heads(self): return self.n_head @property def num_hidden_layers(self): return self.n_layer ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_albert_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert ALBERT checkpoint.""" import argparse import logging import torch from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): # Initialise PyTorch model config = AlbertConfig.from_json_file(albert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = AlbertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_albert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--albert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained ALBERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_bart_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BART checkpoint.""" import argparse import logging import os from pathlib import Path import fairseq import torch from packaging import version from transformers import ( BartConfig, BartForConditionalGeneration, BartForSequenceClassification, BartModel, BartTokenizer, ) from transformers.modeling_bart import _make_linear_from_emb FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} if version.parse(fairseq.__version__) < version.parse("0.9.0"): raise Exception("requires fairseq >= 0.9.0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) SAMPLE_TEXT = " Hello world! cécé herlolip" mnli_rename_keys = [ ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), ] def remove_ignore_keys_(state_dict): ignore_keys = [ "encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor", ] for k in ignore_keys: state_dict.pop(k, None) def rename_key(dct, old, new): val = dct.pop(old) dct[new] = val def load_xsum_checkpoint(checkpoint_path): """Checkpoint path should end in model.pt""" sd = torch.load(checkpoint_path, map_location="cpu") hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() hub_interface.model.load_state_dict(sd["model"]) return hub_interface def convert_checkpoint_from_disk(checkpoint_path, **config_kwargs): state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] remove_ignore_keys_(state_dict) vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] mbart_config = BartConfig(vocab_size=vocab_size, **config_kwargs) model = BartForConditionalGeneration(mbart_config) model.model.load_state_dict(state_dict) if hasattr(model, "lm_head"): model.lm_head = _make_linear_from_emb(model.model.shared) return model @torch.no_grad() def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): """ Copy/paste/tweak model's weights to our BERT structure. """ if not os.path.exists(checkpoint_path): bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() else: bart = load_xsum_checkpoint(checkpoint_path) bart.model.upgrade_state_dict(bart.model.state_dict()) if hf_checkpoint_name is None: hf_checkpoint_name = checkpoint_path.replace(".", "-") config = BartConfig.from_pretrained(hf_checkpoint_name) tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) assert torch.eq(tokens, tokens2).all() if checkpoint_path == "bart.large.mnli": state_dict = bart.state_dict() remove_ignore_keys_(state_dict) state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] for src, dest in mnli_rename_keys: rename_key(state_dict, src, dest) model = BartForSequenceClassification(config).eval() model.load_state_dict(state_dict) fairseq_output = bart.predict("mnli", tokens, return_logits=True) new_model_outputs = model(tokens)[0] # logits else: # no classification heads to worry about state_dict = bart.model.state_dict() remove_ignore_keys_(state_dict) state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] fairseq_output = bart.extract_features(tokens) if hf_checkpoint_name == "facebook/bart-large": model = BartModel(config).eval() model.load_state_dict(state_dict) new_model_outputs = model(tokens).model[0] else: model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt model.model.load_state_dict(state_dict) if hasattr(model, "lm_head"): model.lm_head = _make_linear_from_emb(model.model.shared) new_model_outputs = model.model(tokens)[0] # Check results assert fairseq_output.shape == new_model_outputs.shape assert (fairseq_output == new_model_outputs).all().item() Path(pytorch_dump_folder_path).mkdir(exist_ok=True) model.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." ) parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") parser.add_argument( "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" ) args = parser.parse_args() convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_bert_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BERT checkpoint.""" import argparse import logging import torch from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = BertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_bert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_bert_pytorch_checkpoint_to_original_tf.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" import argparse import os import numpy as np import tensorflow as tf import torch from transformers import BertModel def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): """ :param model:BertModel Pytorch model instance to be converted :param ckpt_dir: Tensorflow model directory :param model_name: model name :return: Currently supported HF models: Y BertModel N BertForMaskedLM N BertForPreTraining N BertForMultipleChoice N BertForNextSentencePrediction N BertForSequenceClassification N BertForQuestionAnswering """ tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") var_map = ( ("layer.", "layer_"), ("word_embeddings.weight", "word_embeddings"), ("position_embeddings.weight", "position_embeddings"), ("token_type_embeddings.weight", "token_type_embeddings"), (".", "/"), ("LayerNorm/weight", "LayerNorm/gamma"), ("LayerNorm/bias", "LayerNorm/beta"), ("weight", "kernel"), ) if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) state_dict = model.state_dict() def to_tf_var_name(name: str): for patt, repl in iter(var_map): name = name.replace(patt, repl) return "bert/{}".format(name) def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) session.run(tf_var) return tf_var tf.reset_default_graph() with tf.Session() as session: for var_name in state_dict: tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() if any([x in var_name for x in tensors_to_transpose]): torch_tensor = torch_tensor.T tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) tf.keras.backend.set_value(tf_var, torch_tensor) tf_weight = session.run(tf_var) print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) saver = tf.train.Saver(tf.trainable_variables()) saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) def main(raw_args=None): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased") parser.add_argument( "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" ) parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") args = parser.parse_args(raw_args) model = BertModel.from_pretrained( pretrained_model_name_or_path=args.model_name, state_dict=torch.load(args.pytorch_model_path), cache_dir=args.cache_dir, ) convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) if __name__ == "__main__": main() ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py ================================================ import argparse import os import torch from transformers.file_utils import WEIGHTS_NAME DIALOGPT_MODELS = ["small", "medium", "large"] OLD_KEY = "lm_head.decoder.weight" NEW_KEY = "lm_head.weight" def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): d = torch.load(checkpoint_path) d[NEW_KEY] = d.pop(OLD_KEY) os.makedirs(pytorch_dump_folder_path, exist_ok=True) torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dialogpt_path", default=".", type=str) args = parser.parse_args() for MODEL in DIALOGPT_MODELS: checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" convert_dialogpt_checkpoint( checkpoint_path, pytorch_dump_folder_path, ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_electra_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert ELECTRA checkpoint.""" import argparse import logging import torch from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): # Initialise PyTorch model config = ElectraConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) if discriminator_or_generator == "discriminator": model = ElectraForPreTraining(config) elif discriminator_or_generator == "generator": model = ElectraForMaskedLM(config) else: raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") # Load weights from tf checkpoint load_tf_weights_in_electra( model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator ) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--discriminator_or_generator", default=None, type=str, required=True, help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " "'generator'.", ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch( args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_gpt2_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import logging import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 logging.basicConfig(level=logging.INFO) def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): # Construct model if gpt2_config_file == "": config = GPT2Config() else: config = GPT2Config.from_json_file(gpt2_config_file) model = GPT2Model(config) # Load weights from numpy load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--gpt2_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained OpenAI model. \n" "This specifies the model architecture.", ) args = parser.parse_args() convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_graph_to_onnx.py ================================================ from argparse import ArgumentParser from os import listdir, makedirs from os.path import abspath, dirname, exists from typing import Dict, List, Optional, Tuple from transformers import is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers1 models to ONNX IR """ def __init__(self): super(OnnxConverterArgumentParser, self).__init__("ONNX Converter") self.add_argument("--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)") self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument("--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model") self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument("--check-loading", action="store_true", help="Check ONNX is able to load the model") self.add_argument("--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb") self.add_argument("output") def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any None Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ model_args_name = model.forward.__code__.co_varnames ordered_input_names = [] model_args = [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: break return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: def build_shape_dict(tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError("Unable to infer tensor axes ({})".format(len(tensor.shape))) else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) return axes tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = ["output_{}".format(i) for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args(framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline: # If no tokenizer provided if tokenizer is None: tokenizer = model print("Loading pipeline (model: {}, tokenizer: {})".format(model, tokenizer)) # Allocate tokenizer and model return pipeline("feature-extraction", model=model, tokenizer=tokenizer, framework=framework) def convert_pytorch(nlp: Pipeline, opset: int, output: str, use_external_format: bool): if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print("PyTorch: {}".format(torch.__version__)) with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output, input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: str): if not is_tf_available(): raise Exception( "Cannot convert {} because TF is not installed. Please install torch first.".format(args.model) ) print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import convert_keras, save_model, __version__ as k2ov print("TensorFlow: {}, keras2onnx: {}".format(tf.version.VERSION, k2ov)) # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output) except ImportError as e: raise Exception( "Cannot import {} required to convert TF model to ONNX. Please install {} first.".format(e.name, e.name) ) def convert( framework: str, model: str, output: str, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, ): print("ONNX opset version set to: {}".format(opset)) # Load the pipeline nlp = load_graph_from_args(framework, model, tokenizer) parent = dirname(output) if not exists(parent): print("Creating folder {}".format(parent)) makedirs(parent) elif len(listdir(parent)) > 0: raise Exception("Folder {} is not empty, aborting conversion".format(parent)) # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def verify(path: str): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print("Checking ONNX model loading from: {}".format(path)) try: onnx_options = SessionOptions() _ = InferenceSession(path, onnx_options, providers=["CPUExecutionProvider"]) print("Model correctly loaded") except RuntimeException as re: print("Error while loading the model: {}".format(re)) if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = abspath(args.output) try: # Convert convert(args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format) # And verify if args.check_loading: verify(args.output) except Exception as e: print("Error while converting the model: {}".format(e)) exit(1) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_longformer_original_pytorch_lightning_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert RoBERTa checkpoint.""" import argparse import pytorch_lightning as pl import torch from transformers.modeling_longformer import LongformerForQuestionAnswering, LongformerModel class LightningModel(pl.LightningModule): def __init__(self, model): super().__init__() self.model = model self.num_labels = 2 self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) # implement only because lighning requires to do so def forward(self): pass def convert_longformer_qa_checkpoint_to_pytorch( longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str ): # load longformer model from model identifier longformer = LongformerModel.from_pretrained(longformer_model) lightning_model = LightningModel(longformer) ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) lightning_model.load_state_dict(ckpt["state_dict"]) # init longformer question answering model longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) # transfer weights longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) longformer_for_qa.eval() # save model longformer_for_qa.save_pretrained(pytorch_dump_folder_path) print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path)) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--longformer_model", default=None, type=str, required=True, help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", ) parser.add_argument( "--longformer_question_answering_ckpt_path", default=None, type=str, required=True, help="Path the official PyTorch Lighning Checkpoint.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_longformer_qa_checkpoint_to_pytorch( args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_marian_to_pytorch.py ================================================ import argparse import json import os import shutil import warnings from pathlib import Path from typing import Dict, List, Union from zipfile import ZipFile import numpy as np import torch from tqdm import tqdm from transformers import MarianConfig, MarianMTModel, MarianTokenizer from transformers.hf_api import HfApi def remove_prefix(text: str, prefix: str): if text.startswith(prefix): return text[len(prefix) :] return text # or whatever def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): sd = {} for k in opus_dict: if not k.startswith(layer_prefix): continue stripped = remove_prefix(k, layer_prefix) v = opus_dict[k].T # besides embeddings, everything must be transposed. sd[converter[stripped]] = torch.tensor(v).squeeze() return sd def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is_decoder=False): for i, layer in enumerate(layer_lst): layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" sd = convert_encoder_layer(opus_state, layer_tag, converter) layer.load_state_dict(sd, strict=True) def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]: """Find models that can accept src_lang as input and return tgt_lang as output.""" prefix = "Helsinki-NLP/opus-mt-" api = HfApi() model_list = api.model_list() model_ids = [x.modelId for x in model_list if x.modelId.startswith("Helsinki-NLP")] src_and_targ = [ remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m ] # + cant be loaded. matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b] return matching def add_emb_entries(wemb, final_bias, n_special_tokens=1): vsize, d_model = wemb.shape embs_to_add = np.zeros((n_special_tokens, d_model)) new_embs = np.concatenate([wemb, embs_to_add]) bias_to_add = np.zeros((n_special_tokens, 1)) new_bias = np.concatenate((final_bias, bias_to_add), axis=1) return new_embs, new_bias def _cast_yaml_str(v): bool_dct = {"true": True, "false": False} if not isinstance(v, str): return v elif v in bool_dct: return bool_dct[v] try: return int(v) except (TypeError, ValueError): return v def cast_marian_config(raw_cfg: Dict[str, str]) -> Dict: return {k: _cast_yaml_str(v) for k, v in raw_cfg.items()} CONFIG_KEY = "special:model.yml" def load_config_from_state_dict(opus_dict): import yaml cfg_str = "".join([chr(x) for x in opus_dict[CONFIG_KEY]]) yaml_cfg = yaml.load(cfg_str[:-1], Loader=yaml.BaseLoader) return cast_marian_config(yaml_cfg) def find_model_file(dest_dir): # this one better model_files = list(Path(dest_dir).glob("*.npz")) assert len(model_files) == 1, model_files model_file = model_files[0] return model_file # Group Names Logic: change long opus model names to something shorter, like opus-mt-en-ROMANCE ROM_GROUP = "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la" GROUPS = [ ("cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "ZH"), (ROM_GROUP, "ROMANCE"), ("de+nl+fy+af+da+fo+is+no+nb+nn+sv", "NORTH_EU"), ("da+fo+is+no+nb+nn+sv", "SCANDINAVIA"), ("se+sma+smj+smn+sms", "SAMI"), ("nb_NO+nb+nn_NO+nn+nog+no_nb+no", "NORWAY"), ("ga+cy+br+gd+kw+gv", "CELTIC"), # https://en.wikipedia.org/wiki/Insular_Celtic_languages ] GROUP_TO_OPUS_NAME = { "opus-mt-ZH-de": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-de", "opus-mt-ZH-fi": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", "opus-mt-ZH-sv": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-sv", "opus-mt-SCANDINAVIA-SCANDINAVIA": "da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+sv", "opus-mt-NORTH_EU-NORTH_EU": "de+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+sv", "opus-mt-de-ZH": "de-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-en_el_es_fi-en_el_es_fi": "en+el+es+fi-en+el+es+fi", "opus-mt-en-ROMANCE": "en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la", "opus-mt-en-CELTIC": "en-ga+cy+br+gd+kw+gv", "opus-mt-es-NORWAY": "es-nb_NO+nb+nn_NO+nn+nog+no_nb+no", "opus-mt-fi_nb_no_nn_ru_sv_en-SAMI": "fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms", "opus-mt-fi-ZH": "fi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-fi-NORWAY": "fi-nb_NO+nb+nn_NO+nn+nog+no_nb+no", "opus-mt-ROMANCE-en": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en", "opus-mt-CELTIC-en": "ga+cy+br+gd+kw+gv-en", "opus-mt-sv-ZH": "sv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "opus-mt-sv-NORWAY": "sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no", } OPUS_GITHUB_URL = "https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/" ORG_NAME = "Helsinki-NLP/" def convert_opus_name_to_hf_name(x): for substr, grp_name in GROUPS: x = x.replace(substr, grp_name) return x.replace("+", "_") def convert_hf_name_to_opus_name(hf_model_name): """Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME.""" hf_model_name = remove_prefix(hf_model_name, ORG_NAME) if hf_model_name in GROUP_TO_OPUS_NAME: opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name] else: opus_w_prefix = hf_model_name.replace("_", "+") return remove_prefix(opus_w_prefix, "opus-mt-") def write_model_card( hf_model_name: str, repo_path="OPUS-MT-train/models/", dry_run=False, model_card_dir=Path("marian_converted/model_cards/Helsinki-NLP/"), ) -> str: """Copy the most recent model's readme section from opus, and add metadata. upload command: s3cmd sync --recursive model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ """ hf_model_name = remove_prefix(hf_model_name, ORG_NAME) opus_name: str = convert_hf_name_to_opus_name(hf_model_name) opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] readme_url = OPUS_GITHUB_URL + f"{opus_name}/README.md" s, t = ",".join(opus_src), ",".join(opus_tgt) extra_markdown = f"### {hf_model_name}\n\n* source languages: {s}\n* target languages: {t}\n* OPUS readme: [{opus_name}]({readme_url})\n" # combine with opus markdown opus_readme_path = Path(f"{repo_path}{opus_name}/README.md") assert opus_readme_path.exists(), opus_readme_path content = opus_readme_path.open().read() content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. content = "*".join(content.split("*")[1:]) content = extra_markdown + "\n* " + content.replace("download", "download original weights") if dry_run: return content # Save string to model_cards/hf_model_name/readme.md model_card_dir.mkdir(exist_ok=True) sub_dir = model_card_dir / hf_model_name sub_dir.mkdir(exist_ok=True) dest = sub_dir / "README.md" dest.open("w").write(content) return content def get_clean_model_id_mapping(multiling_model_ids): return {x: convert_opus_name_to_hf_name(x) for x in multiling_model_ids} def make_registry(repo_path="Opus-MT-train/models"): if not (Path(repo_path) / "fr-en" / "README.md").exists(): raise ValueError( f"repo_path:{repo_path} does not exist: " "You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling." ) results = {} for p in Path(repo_path).ls(): n_dash = p.name.count("-") if n_dash == 0: continue else: lns = list(open(p / "README.md").readlines()) results[p.name] = _parse_readme(lns) return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] def convert_all_sentencepiece_models(model_list=None, repo_path=None): """Requires 300GB""" save_dir = Path("marian_ckpt") dest_dir = Path("marian_converted") dest_dir.mkdir(exist_ok=True) if model_list is None: model_list: list = make_registry(repo_path=repo_path) for k, prepro, download, test_set_url in tqdm(model_list): if "SentencePiece" not in prepro: # dont convert BPE models. continue if not os.path.exists(save_dir / k / "pytorch_model.bin"): download_and_unzip(download, save_dir / k) pair_name = convert_opus_name_to_hf_name(k) convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}") def lmap(f, x) -> List: return list(map(f, x)) def fetch_test_set(test_set_url): import wget fname = wget.download(test_set_url, "opus_test.txt") lns = Path(fname).open().readlines() src = lmap(str.strip, lns[::4]) gold = lmap(str.strip, lns[1::4]) mar_model = lmap(str.strip, lns[2::4]) assert len(gold) == len(mar_model) == len(src) os.remove(fname) return src, mar_model, gold def convert_whole_dir(path=Path("marian_ckpt/")): for subdir in tqdm(list(path.ls())): dest_dir = f"marian_converted/{subdir.name}" if (dest_dir / "pytorch_model.bin").exists(): continue convert(source_dir, dest_dir) def _parse_readme(lns): """Get link and metadata from opus model card equivalent.""" subres = {} for ln in [x.strip() for x in lns]: if not ln.startswith("*"): continue ln = ln[1:].strip() for k in ["download", "dataset", "models", "model", "pre-processing"]: if ln.startswith(k): break else: continue if k in ["dataset", "model", "pre-processing"]: splat = ln.split(":") _, v = splat subres[k] = v elif k == "download": v = ln.split("(")[-1][:-1] subres[k] = v return subres def save_tokenizer_config(dest_dir: Path): dname = dest_dir.name.split("-") dct = dict(target_lang=dname[-1], source_lang="-".join(dname[:-1])) save_json(dct, dest_dir / "tokenizer_config.json") def add_to_vocab_(vocab: Dict[str, int], special_tokens: List[str]): start = max(vocab.values()) + 1 added = 0 for tok in special_tokens: if tok in vocab: continue vocab[tok] = start + added added += 1 return added def find_vocab_file(model_dir): return list(model_dir.glob("*vocab.yml"))[0] def add_special_tokens_to_vocab(model_dir: Path) -> None: vocab = load_yaml(find_vocab_file(model_dir)) vocab = {k: int(v) for k, v in vocab.items()} num_added = add_to_vocab_(vocab, [""]) print(f"added {num_added} tokens to vocab") save_json(vocab, model_dir / "vocab.json") save_tokenizer_config(model_dir) def save_tokenizer(self, save_directory): dest = Path(save_directory) src_path = Path(self.init_kwargs["source_spm"]) for dest_name in {"source.spm", "target.spm", "tokenizer_config.json"}: shutil.copyfile(src_path.parent / dest_name, dest / dest_name) save_json(self.encoder, dest / "vocab.json") def check_equal(marian_cfg, k1, k2): v1, v2 = marian_cfg[k1], marian_cfg[k2] assert v1 == v2, f"hparams {k1},{k2} differ: {v1} != {v2}" def check_marian_cfg_assumptions(marian_cfg): assumed_settings = { "tied-embeddings-all": True, "layer-normalization": False, "right-left": False, "transformer-ffn-depth": 2, "transformer-aan-depth": 2, "transformer-no-projection": False, "transformer-postprocess-emb": "d", "transformer-postprocess": "dan", # Dropout, add, normalize "transformer-preprocess": "", "type": "transformer", "ulr-dim-emb": 0, "dec-cell-base-depth": 2, "dec-cell-high-depth": 1, "transformer-aan-nogate": False, } for k, v in assumed_settings.items(): actual = marian_cfg[k] assert actual == v, f"Unexpected config value for {k} expected {v} got {actual}" check_equal(marian_cfg, "transformer-ffn-activation", "transformer-aan-activation") check_equal(marian_cfg, "transformer-ffn-depth", "transformer-aan-depth") check_equal(marian_cfg, "transformer-dim-ffn", "transformer-dim-aan") BIAS_KEY = "decoder_ff_logit_out_b" BART_CONVERTER = { # for each encoder and decoder layer "self_Wq": "self_attn.q_proj.weight", "self_Wk": "self_attn.k_proj.weight", "self_Wv": "self_attn.v_proj.weight", "self_Wo": "self_attn.out_proj.weight", "self_bq": "self_attn.q_proj.bias", "self_bk": "self_attn.k_proj.bias", "self_bv": "self_attn.v_proj.bias", "self_bo": "self_attn.out_proj.bias", "self_Wo_ln_scale": "self_attn_layer_norm.weight", "self_Wo_ln_bias": "self_attn_layer_norm.bias", "ffn_W1": "fc1.weight", "ffn_b1": "fc1.bias", "ffn_W2": "fc2.weight", "ffn_b2": "fc2.bias", "ffn_ffn_ln_scale": "final_layer_norm.weight", "ffn_ffn_ln_bias": "final_layer_norm.bias", # Decoder Cross Attention "context_Wk": "encoder_attn.k_proj.weight", "context_Wo": "encoder_attn.out_proj.weight", "context_Wq": "encoder_attn.q_proj.weight", "context_Wv": "encoder_attn.v_proj.weight", "context_bk": "encoder_attn.k_proj.bias", "context_bo": "encoder_attn.out_proj.bias", "context_bq": "encoder_attn.q_proj.bias", "context_bv": "encoder_attn.v_proj.bias", "context_Wo_ln_scale": "encoder_attn_layer_norm.weight", "context_Wo_ln_bias": "encoder_attn_layer_norm.bias", } class OpusState: def __init__(self, source_dir): npz_path = find_model_file(source_dir) self.state_dict = np.load(npz_path) cfg = load_config_from_state_dict(self.state_dict) assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1] assert "Wpos" not in self.state_dict self.state_dict = dict(self.state_dict) self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1) self.pad_token_id = self.wemb.shape[0] - 1 cfg["vocab_size"] = self.pad_token_id + 1 # self.state_dict['Wemb'].sha self.state_keys = list(self.state_dict.keys()) if "Wtype" in self.state_dict: raise ValueError("found Wtype key") self._check_layer_entries() self.source_dir = source_dir self.cfg = cfg hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape assert hidden_size == cfg["dim-emb"] == 512 # Process decoder.yml decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml")) check_marian_cfg_assumptions(cfg) self.hf_config = MarianConfig( vocab_size=cfg["vocab_size"], decoder_layers=cfg["dec-depth"], encoder_layers=cfg["enc-depth"], decoder_attention_heads=cfg["transformer-heads"], encoder_attention_heads=cfg["transformer-heads"], decoder_ffn_dim=cfg["transformer-dim-ffn"], encoder_ffn_dim=cfg["transformer-dim-ffn"], d_model=cfg["dim-emb"], activation_function=cfg["transformer-aan-activation"], pad_token_id=self.pad_token_id, eos_token_id=0, bos_token_id=0, max_position_embeddings=cfg["dim-emb"], scale_embedding=True, normalize_embedding="n" in cfg["transformer-preprocess"], static_position_embeddings=not cfg["transformer-train-position-embeddings"], dropout=0.1, # see opus-mt-train repo/transformer-dropout param. # default: add_final_layer_norm=False, num_beams=decoder_yml["beam-size"], decoder_start_token_id=self.pad_token_id, bad_words_ids=[[self.pad_token_id]], max_length=512, ) def _check_layer_entries(self): self.encoder_l1 = self.sub_keys("encoder_l1") self.decoder_l1 = self.sub_keys("decoder_l1") self.decoder_l2 = self.sub_keys("decoder_l2") if len(self.encoder_l1) != 16: warnings.warn(f"Expected 16 keys for each encoder layer, got {len(self.encoder_l1)}") if len(self.decoder_l1) != 26: warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") if len(self.decoder_l2) != 26: warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") @property def extra_keys(self): extra = [] for k in self.state_keys: if ( k.startswith("encoder_l") or k.startswith("decoder_l") or k in [CONFIG_KEY, "Wemb", "Wpos", "decoder_ff_logit_out_b"] ): continue else: extra.append(k) return extra def sub_keys(self, layer_prefix): return [remove_prefix(k, layer_prefix) for k in self.state_dict if k.startswith(layer_prefix)] def load_marian_model(self) -> MarianMTModel: state_dict, cfg = self.state_dict, self.hf_config assert cfg.static_position_embeddings model = MarianMTModel(cfg) assert "hidden_size" not in cfg.to_dict() load_layers_( model.model.encoder.layers, state_dict, BART_CONVERTER, ) load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) # handle tensors not associated with layers wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb)) bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias)) model.model.shared.weight = wemb_tensor model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared model.final_logits_bias = bias_tensor if "Wpos" in state_dict: print("Unexpected: got Wpos") wpos_tensor = torch.tensor(state_dict["Wpos"]) model.model.encoder.embed_positions.weight = wpos_tensor model.model.decoder.embed_positions.weight = wpos_tensor if cfg.normalize_embedding: assert "encoder_emb_ln_scale_pre" in state_dict raise NotImplementedError("Need to convert layernorm_embedding") assert not self.extra_keys, f"Failed to convert {self.extra_keys}" assert model.model.shared.padding_idx == self.pad_token_id return model def download_and_unzip(url, dest_dir): try: import wget except ImportError: raise ImportError("you must pip install wget") filename = wget.download(url) unzip(filename, dest_dir) os.remove(filename) def convert(source_dir: Path, dest_dir): dest_dir = Path(dest_dir) dest_dir.mkdir(exist_ok=True) add_special_tokens_to_vocab(source_dir) tokenizer = MarianTokenizer.from_pretrained(str(source_dir)) save_tokenizer(tokenizer, dest_dir) opus_state = OpusState(source_dir) assert opus_state.cfg["vocab_size"] == len(tokenizer.encoder) # save_json(opus_state.cfg, dest_dir / "marian_original_config.json") # ^^ Save human readable marian config for debugging model = opus_state.load_marian_model() model.save_pretrained(dest_dir) model.from_pretrained(dest_dir) # sanity check if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--src", type=str, help="path to marian model dir", default="en-de") parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") args = parser.parse_args() source_dir = Path(args.src) assert source_dir.exists() dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest convert(source_dir, dest_dir) def load_yaml(path): import yaml with open(path) as f: return yaml.load(f, Loader=yaml.BaseLoader) def save_json(content: Union[Dict, List], path: str) -> None: with open(path, "w") as f: json.dump(content, f) def unzip(zip_path: str, dest_dir: str) -> None: with ZipFile(zip_path, "r") as zipObj: zipObj.extractall(dest_dir) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_openai_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import logging import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt logging.basicConfig(level=logging.INFO) def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): # Construct model if openai_config_file == "": config = OpenAIGPTConfig() else: config = OpenAIGPTConfig.from_json_file(openai_config_file) model = OpenAIGPTModel(config) # Load weights from numpy load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--openai_checkpoint_folder_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--openai_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained OpenAI model. \n" "This specifies the model architecture.", ) args = parser.parse_args() convert_openai_checkpoint_to_pytorch( args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_pytorch_checkpoint_to_tf2.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Convert pytorch checkpoints to TensorFlow """ import argparse import logging import os from transformers import ( ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, WEIGHTS_NAME, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, FlaubertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, T5Config, TFAlbertForPreTraining, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, TFCamembertForMaskedLM, TFCTRLLMHeadModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFElectraForPreTraining, TFFlaubertWithLMHeadModel, TFGPT2LMHeadModel, TFOpenAIGPTLMHeadModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFT5ForConditionalGeneration, TFTransfoXLLMHeadModel, TFXLMRobertaForMaskedLM, TFXLMWithLMHeadModel, TFXLNetLMHeadModel, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, cached_path, hf_bucket_url, is_torch_available, load_pytorch_checkpoint_in_tf2_model, ) if is_torch_available(): import torch import numpy as np from transformers import ( BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, GPT2LMHeadModel, XLNetLMHeadModel, XLMWithLMHeadModel, XLMRobertaForMaskedLM, TransfoXLLMHeadModel, OpenAIGPTLMHeadModel, RobertaForMaskedLM, RobertaForSequenceClassification, CamembertForMaskedLM, FlaubertWithLMHeadModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, CTRLLMHeadModel, AlbertForPreTraining, T5ForConditionalGeneration, ElectraForPreTraining, ) logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { "bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,), "bert-large-uncased-whole-word-masking-finetuned-squad": ( BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "bert-large-cased-whole-word-masking-finetuned-squad": ( BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "bert-base-cased-finetuned-mrpc": ( BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,), "xlm-roberta": ( XLMRobertaConfig, TFXLMRobertaForMaskedLM, XLMRobertaForMaskedLM, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "transfo-xl": ( TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "openai-gpt": ( OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,), "roberta-large-mnli": ( RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "camembert": ( CamembertConfig, TFCamembertForMaskedLM, CamembertForMaskedLM, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "flaubert": ( FlaubertConfig, TFFlaubertWithLMHeadModel, FlaubertWithLMHeadModel, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "distilbert": ( DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "distilbert-base-distilled-squad": ( DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), "ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,), "albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,), "t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,), "electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,), } def convert_pt_checkpoint_to_tf( model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True ): if model_type not in MODEL_CLASSES: raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys()))) config_class, model_class, pt_model_class, aws_config_map = MODEL_CLASSES[model_type] # Initialise TF model if config_file in aws_config_map: config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models) config = config_class.from_json_file(config_file) config.output_hidden_states = True config.output_attentions = True print("Building TensorFlow model from configuration: {}".format(str(config))) tf_model = model_class(config) # Load weights from tf checkpoint if pytorch_checkpoint_path in aws_config_map.keys(): pytorch_checkpoint_url = hf_bucket_url(pytorch_checkpoint_path, filename=WEIGHTS_NAME) pytorch_checkpoint_path = cached_path(pytorch_checkpoint_url, force_download=not use_cached_models) # Load PyTorch checkpoint in tf2 model: tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") pt_model = pt_model_class.from_pretrained( pretrained_model_name_or_path=None, config=config, state_dict=state_dict ) with torch.no_grad(): pto = pt_model(**pt_model.dummy_inputs) np_pt = pto[0].numpy() np_tf = tfo[0].numpy() diff = np.amax(np.abs(np_pt - np_tf)) print("Max absolute difference between models outputs {}".format(diff)) assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff) # Save pytorch-model print("Save TensorFlow model to {}".format(tf_dump_path)) tf_model.save_weights(tf_dump_path, save_format="h5") def convert_all_pt_checkpoints_to_tf( args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False, ): assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" if args_model_type is None: model_types = list(MODEL_CLASSES.keys()) else: model_types = [args_model_type] for j, model_type in enumerate(model_types, start=1): print("=" * 100) print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type)) print("=" * 100) if model_type not in MODEL_CLASSES: raise ValueError( "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())) ) config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] if model_shortcut_names_or_path is None: model_shortcut_names_or_path = list(aws_model_maps.keys()) if config_shortcut_names_or_path is None: config_shortcut_names_or_path = model_shortcut_names_or_path for i, (model_shortcut_name, config_shortcut_name) in enumerate( zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1 ): print("-" * 100) if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name: if not only_convert_finetuned_models: print(" Skipping finetuned checkpoint {}".format(model_shortcut_name)) continue model_type = model_shortcut_name elif only_convert_finetuned_models: print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name)) continue print( " Converting checkpoint {}/{}: {} - model_type {}".format( i, len(aws_config_map), model_shortcut_name, model_type ) ) print("-" * 100) if config_shortcut_name in aws_config_map: config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models) else: config_file = cached_path(config_shortcut_name, force_download=not use_cached_models) if model_shortcut_name in aws_model_maps: model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models) else: model_file = cached_path(model_shortcut_name, force_download=not use_cached_models) if os.path.isfile(model_shortcut_name): model_shortcut_name = "converted_model" convert_pt_checkpoint_to_tf( model_type=model_type, pytorch_checkpoint_path=model_file, config_file=config_file, tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"), compare_with_pt_model=compare_with_pt_model, ) if remove_cached_files: os.remove(config_file) os.remove(model_file) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." ) parser.add_argument( "--model_type", default=None, type=str, help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format( list(MODEL_CLASSES.keys()) ), ) parser.add_argument( "--pytorch_checkpoint_path", default=None, type=str, help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. " "If not given, will download and convert all the checkpoints from AWS.", ) parser.add_argument( "--config_file", default=None, type=str, help="The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture. If not given and " "--pytorch_checkpoint_path is not given or is a shortcut name" "use the configuration associated to the shortcut name on the AWS", ) parser.add_argument( "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions." ) parser.add_argument( "--use_cached_models", action="store_true", help="Use cached models if possible instead of updating to latest checkpoint versions.", ) parser.add_argument( "--remove_cached_files", action="store_true", help="Remove pytorch models after conversion (save memory when converting in batches).", ) parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.") args = parser.parse_args() # if args.pytorch_checkpoint_path is not None: # convert_pt_checkpoint_to_tf(args.model_type.lower(), # args.pytorch_checkpoint_path, # args.config_file if args.config_file is not None else args.pytorch_checkpoint_path, # args.tf_dump_path, # compare_with_pt_model=args.compare_with_pt_model, # use_cached_models=args.use_cached_models) # else: convert_all_pt_checkpoints_to_tf( args.model_type.lower() if args.model_type is not None else None, args.tf_dump_path, model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None, config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, compare_with_pt_model=args.compare_with_pt_model, use_cached_models=args.use_cached_models, remove_cached_files=args.remove_cached_files, only_convert_finetuned_models=args.only_convert_finetuned_models, ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_reformer_trax_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Reformer checkpoint.""" import argparse import logging import pickle import numpy as np import torch from transformers import ReformerConfig, ReformerModelWithLMHead logging.basicConfig(level=logging.INFO) def set_param(torch_layer, weight, bias=None): # set parameter of one layer assert torch_layer.weight.shape == weight.shape, "{} layer.weight does not match".format(torch_layer) torch_layer.weight = torch.nn.Parameter(weight) if bias is not None: assert torch_layer.bias.shape == bias.shape, "{} layer.bias does not match".format(torch_layer) torch_layer.bias = torch.nn.Parameter(bias) def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size): # set torch weights for 1-to-1 comparison np_query_key = np.asarray(weights[0]) np_value = np.asarray(weights[1]) np_dense = np.asarray(weights[2]) set_param( torch_layer.self_attention.query_key, torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1), ) def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size): # set torch weights for 1-to-1 comparison np_query = np.asarray(weights[0]) np_key = np.asarray(weights[1]) np_value = np.asarray(weights[2]) np_dense = np.asarray(weights[3]) set_param( torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size), ) set_param( torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1), ) def set_block_weights_in_torch(weights, torch_block, hidden_size): # layernorm 1 layer_norm_1 = weights[0][0][0] layer_norm_1_weight = np.asarray(layer_norm_1[0]) layer_norm_1_bias = np.asarray(layer_norm_1[1]) set_param( torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias), ) # lsh weights + output attn_weights = weights[0][1] if len(attn_weights) < 4: set_layer_weights_in_torch_lsh(attn_weights, torch_block.attention, hidden_size) else: set_layer_weights_in_torch_local(attn_weights, torch_block.attention, hidden_size) # intermediate weighs intermediate_weights = weights[2][0][1][2] # Chunked Feed Forward if len(intermediate_weights) == 4: intermediate_weights = intermediate_weights[2] # layernorm 2 layer_norm_2_weight = np.asarray(intermediate_weights[0][0]) layer_norm_2_bias = np.asarray(intermediate_weights[0][1]) set_param( torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias), ) # intermediate dense inter_dense_weight = np.asarray(intermediate_weights[1][0]) inter_dense_bias = np.asarray(intermediate_weights[1][1]) set_param( torch_block.feed_forward.dense.dense, torch.tensor(inter_dense_weight).transpose(0, 1).contiguous(), torch.tensor(inter_dense_bias), ) # intermediate out out_dense_weight = np.asarray(intermediate_weights[4][0]) out_dense_bias = np.asarray(intermediate_weights[4][1]) set_param( torch_block.feed_forward.output.dense, torch.tensor(out_dense_weight).transpose(0, 1).contiguous(), torch.tensor(out_dense_bias), ) def set_model_weights_in_torch(weights, torch_model, hidden_size): # reformer model torch_model_reformer = torch_model.reformer # word embeds word_embeddings = np.asarray(weights[1]) set_param( torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings), ) if isinstance(weights[3], tuple): position_embeddings = torch_model_reformer.embeddings.position_embeddings for emb_idx in range(len(position_embeddings.weights)): emb_weights = np.asarray(weights[3][emb_idx][0]) assert position_embeddings.weights[emb_idx].shape == emb_weights.shape, "{} emb does not match".format( position_embeddings[emb_idx] ) position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights)) trax_layer_weights = weights[5] assert len(torch_model_reformer.encoder.layers) * 4 == len( trax_layer_weights ), "HF and trax model do not have the same number of layers" for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers): block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)] set_block_weights_in_torch(block_weights, layer, hidden_size) # output layer norm layer_norm_out_weight = np.asarray(weights[7][0]) layer_norm_out_bias = np.asarray(weights[7][1]) set_param( torch_model_reformer.encoder.layer_norm, torch.tensor(layer_norm_out_weight), torch.tensor(layer_norm_out_bias), ) # output embeddings output_embed_weights = np.asarray(weights[9][0]) output_embed_bias = np.asarray(weights[9][1]) set_param( torch_model.lm_head.decoder, torch.tensor(output_embed_weights).transpose(0, 1).contiguous(), torch.tensor(output_embed_bias), ) def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = ReformerConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = ReformerModelWithLMHead(config) with open(trax_model_pkl_path, "rb") as f: model_weights = pickle.load(f)["weights"] set_model_weights_in_torch(model_weights, model, config.hidden_size) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--trax_model_pkl_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained Reformer model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_trax_checkpoint_to_pytorch(args.trax_model_pkl_path, args.config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_roberta_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert RoBERTa checkpoint.""" import argparse import logging import pathlib import fairseq import torch from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version from transformers.modeling_bert import BertIntermediate, BertLayer, BertOutput, BertSelfAttention, BertSelfOutput from transformers.modeling_roberta import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification if version.parse(fairseq.__version__) < version.parse("0.9.0"): raise Exception("requires fairseq >= 0.9.0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) SAMPLE_TEXT = "Hello world! cécé herlolip" def convert_roberta_checkpoint_to_pytorch( roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool ): """ Copy/paste/tweak roberta's weights to our BERT structure. """ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout roberta_sent_encoder = roberta.model.decoder.sentence_encoder config = RobertaConfig( vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, layer_norm_eps=1e-5, # PyTorch default used in fairseq ) if classification_head: config.num_labels = roberta.args.num_classes print("Our BERT config:", config) model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) model.eval() # Now let's copy all the weights. # Embeddings model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( model.roberta.embeddings.token_type_embeddings.weight ) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias for i in range(config.num_hidden_layers): # Encoder: start of layer layer: BertLayer = model.roberta.encoder.layer[i] roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] # self attention self_attn: BertSelfAttention = layer.attention.self assert ( roberta_layer.self_attn.k_proj.weight.data.shape == roberta_layer.self_attn.q_proj.weight.data.shape == roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size((config.hidden_size, config.hidden_size)) ) self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias # self-attention output self_output: BertSelfOutput = layer.attention.output assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias # intermediate intermediate: BertIntermediate = layer.intermediate assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias # output bert_output: BertOutput = layer.output assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias # end of layer if classification_head: model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias else: # LM Head model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 our_output = model(input_ids)[0] if classification_head: their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 success = torch.allclose(our_output, their_output, atol=1e-3) print("Do both models output the same tensors?", "🔥" if success else "💩") if not success: raise Exception("Something went wRoNg") pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) parser.add_argument( "--classification_head", action="store_true", help="Whether to convert a final classification head." ) args = parser.parse_args() convert_roberta_checkpoint_to_pytorch( args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_t5_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The T5 authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert T5 checkpoint.""" import argparse import logging import torch from transformers import T5Config, T5Model, load_tf_weights_in_t5 logging.basicConfig(level=logging.INFO) def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = T5Config.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = T5Model(config) # Load weights from tf checkpoint load_tf_weights_in_t5(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained T5 model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert Transformer XL checkpoint and datasets.""" import argparse import logging import os import pickle import sys import torch import transformers.tokenization_transfo_xl as data_utils from transformers import ( CONFIG_NAME, WEIGHTS_NAME, TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl, ) from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES logging.basicConfig(level=logging.INFO) # We do this to be able to load python 2 datasets pickles # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 data_utils.Vocab = data_utils.TransfoXLTokenizer data_utils.Corpus = data_utils.TransfoXLCorpus sys.modules["data_utils"] = data_utils sys.modules["vocabulary"] = data_utils def convert_transfo_xl_checkpoint_to_pytorch( tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file ): if transfo_xl_dataset_file: # Convert a pre-processed corpus (see original TensorFlow repo) with open(transfo_xl_dataset_file, "rb") as fp: corpus = pickle.load(fp, encoding="latin1") # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) corpus_vocab_dict = corpus.vocab.__dict__ torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) corpus_dict_no_vocab = corpus.__dict__ corpus_dict_no_vocab.pop("vocab", None) pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME print("Save dataset to {}".format(pytorch_dataset_dump_path)) torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) if tf_checkpoint_path: # Convert a pre-trained TensorFlow model config_path = os.path.abspath(transfo_xl_config_file) tf_path = os.path.abspath(tf_checkpoint_path) print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) # Initialise PyTorch model if transfo_xl_config_file == "": config = TransfoXLConfig() else: config = TransfoXLConfig.from_json_file(transfo_xl_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = TransfoXLLMHeadModel(config) model = load_tf_weights_in_transfo_xl(model, config, tf_path) # Save pytorch-model pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the folder to store the PyTorch model or dataset/vocab.", ) parser.add_argument( "--tf_checkpoint_path", default="", type=str, help="An optional path to a TensorFlow checkpoint path to be converted.", ) parser.add_argument( "--transfo_xl_config_file", default="", type=str, help="An optional config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.", ) parser.add_argument( "--transfo_xl_dataset_file", default="", type=str, help="An optional dataset file to be converted in a vocabulary.", ) args = parser.parse_args() convert_transfo_xl_checkpoint_to_pytorch( args.tf_checkpoint_path, args.transfo_xl_config_file, args.pytorch_dump_folder_path, args.transfo_xl_dataset_file, ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_xlm_original_pytorch_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert OpenAI GPT checkpoint.""" import argparse import json import logging import numpy import torch from transformers import CONFIG_NAME, WEIGHTS_NAME from transformers.tokenization_xlm import VOCAB_FILES_NAMES logging.basicConfig(level=logging.INFO) def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): # Load checkpoint chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") state_dict = chkpt["model"] # We have the base model one level deeper than the original XLM repository two_levels_state_dict = {} for k, v in state_dict.items(): if "pred_layer" in k: two_levels_state_dict[k] = v else: two_levels_state_dict["transformer." + k] = v config = chkpt["params"] config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) vocab = chkpt["dico_word2id"] vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) # Save pytorch-model pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(two_levels_state_dict, pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(json.dumps(config, indent=2) + "\n") print("Save vocab file to {}".format(pytorch_config_dump_path)) with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: f.write(json.dumps(vocab, indent=2) + "\n") if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." ) args = parser.parse_args() convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/convert_xlnet_original_tf_checkpoint_to_pytorch.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Convert BERT checkpoint.""" import argparse import logging import os import torch from transformers import ( CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, XLNetForQuestionAnswering, XLNetForSequenceClassification, XLNetLMHeadModel, load_tf_weights_in_xlnet, ) GLUE_TASKS_NUM_LABELS = { "cola": 2, "mnli": 3, "mrpc": 2, "sst-2": 2, "sts-b": 1, "qqp": 2, "qnli": 2, "rte": 2, "wnli": 2, } logging.basicConfig(level=logging.INFO) def convert_xlnet_checkpoint_to_pytorch( tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None ): # Initialise PyTorch model config = XLNetConfig.from_json_file(bert_config_file) finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" if finetuning_task in GLUE_TASKS_NUM_LABELS: print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) config.finetuning_task = finetuning_task config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] model = XLNetForSequenceClassification(config) elif "squad" in finetuning_task: config.finetuning_task = finetuning_task model = XLNetForQuestionAnswering(config) else: model = XLNetLMHeadModel(config) # Load weights from tf checkpoint load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) # Save pytorch-model pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: f.write(config.to_json_string()) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) parser.add_argument( "--xlnet_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained XLNet model. \n" "This specifies the model architecture.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the folder to store the PyTorch model or dataset/vocab.", ) parser.add_argument( "--finetuning_task", default=None, type=str, help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", ) args = parser.parse_args() print(args) convert_xlnet_checkpoint_to_pytorch( args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .metrics import is_sklearn_available from .processors import ( DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor, SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, ) if is_sklearn_available(): from .metrics import glue_compute_metrics, xnli_compute_metrics ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/data_collator.py ================================================ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any, Dict, List, NewType, Tuple import torch from torch.nn.utils.rnn import pad_sequence import random import numpy as np from ..tokenization_utils import PreTrainedTokenizer class DataCollator(ABC): """ A `DataCollator` is responsible for batching and pre-processing samples of data as requested by the training loop. """ @abstractmethod def collate_batch(self) -> Dict[str, torch.Tensor]: """ Take a list of samples from a Dataset and collate them into a batch. Returns: A dictionary of tensors """ pass InputDataClass = NewType("InputDataClass", Any) @dataclass class DefaultDataCollator(DataCollator): """ Very simple data collator that: - simply collates batches of dict-like objects - Performs special handling for potential keys named: - `label`: handles a single value (int or float) per object - `label_ids`: handles a list of values per object - does not do any additional preprocessing i.e., Property names of the input object will be used as corresponding inputs to the model. See glue and ner for example of how it's useful. """ def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]: # In this method we'll make the assumption that all `features` in the batch # have the same attributes. # So we will look at the first element as a proxy for what attributes exist # on the whole batch. first = features[0] # Special handling for labels. # Ensure that tensor is created with the correct type # (it should be automatically the case, but let's make sure of it.) if hasattr(first, "label") and first.label is not None: if type(first.label) is int: labels = torch.tensor([f.label for f in features], dtype=torch.long) else: labels = torch.tensor([f.label for f in features], dtype=torch.float) batch = {"labels": labels} elif hasattr(first, "label_ids") and first.label_ids is not None: if type(first.label_ids[0]) is int: labels = torch.tensor([f.label_ids for f in features], dtype=torch.long) else: labels = torch.tensor([f.label_ids for f in features], dtype=torch.float) batch = {"labels": labels} else: batch = {} # Handling of all other possible attributes. # Again, we will use the first element to figure out which key/values are not None for this model. for k, v in vars(first).items(): if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long) return batch @dataclass class DataCollatorForLanguageModeling(DataCollator): """ Data collator used for language modeling. - collates batches of tensors, honoring their tokenizer's pad_token - preprocesses batches for masked language modeling """ tokenizer: PreTrainedTokenizer mlm: bool = True mlm_probability: float = 0.15 def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]: batch = self._tensorize_batch(examples) if self.mlm: inputs, labels = self.mask_tokens7(batch) return {"input_ids": inputs, "labels": labels} else: return {"input_ids": batch, "labels": batch} def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: length_of_first = examples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) if are_tensors_same_length: return torch.stack(examples, dim=0) else: if self.tokenizer._pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({self.tokenizer.__class__.__name__}) does not have one." ) return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) # 10% of the time, we replace masked input tokens with random word indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) inputs[indices_random] = random_words[indices_random] # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels def mask_tokens2(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): if random.random() > 0.85: if random.random() > 0.2: inputs[i][j] = self.tokenizer.mask_token_id elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens3(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) probability_matrix = probability_matrix.numpy() labels = labels.numpy() covered = set() for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15) and (i,j) not in covered: if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.85: for k in range(j,min(j+5,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.7647: for k in range(j,min(j+4,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.5384: for k in range(j,min(j+3,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) elif random.random() > 0.42857: for k in range(j,min(j+2,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i,j)) elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens4(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) inputs = inputs.numpy() ids = [i for i in range(len(inputs))] random.shuffle(ids) inputs = inputs[ids] inputs = torch.from_numpy(inputs) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) total_token = 0 for i in range(len(probability_matrix)): for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 cur_token = 0 probability_matrix = probability_matrix.numpy() labels = labels.numpy() covered = set() ngramFlag = True for i in range(len(probability_matrix)): if cur_token > total_token * 0.03: ngramFlag = False for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15) and (i,j) not in covered: if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.9 and ngramFlag: for k in range(j,min(j+4,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 elif random.random() > 0.222 and ngramFlag: for k in range(j,min(j+3,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 elif random.random() > 0.42857 and ngramFlag: for k in range(j,min(j+2,len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i,k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i,k)) cur_token += 1 else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i,j)) cur_token += 1 elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) cur_token += 1 else: pass else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens5(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() pvals = [0.4, 0.3, 0.2, 0.1] ngrams = np.arange(1, 5, dtype=np.int64) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 choose = random.randint(0, 1) if choose == 0: startIndex = 0 endIndex = np.argwhere(inputs[i] == np.float32(2))[-1][0] elif choose == 1: startIndex = np.argwhere(inputs[i] == np.float32(2))[-1][0] endIndex = np.argwhere(inputs[i] == np.float32(3))[-1][0] valid_j = [index for index in range(startIndex, endIndex + 1)] for j in range(len(probability_matrix[0])): if cur_token < total_token * 0.15: if probability_matrix[i][j] == np.float32(0.15): n = np.random.choice(ngrams, p=pvals) for k in range(n): if j + k >= len(probability_matrix[0]): break if (i, j+k) in covered: continue if j+k in valid_j: if random.random() > 0.7: if random.random() > 0.2: if probability_matrix[i][j+k] == np.float32(0.15): inputs[i][j+k] = self.tokenizer.mask_token_id covered.add((i, j + k)) cur_token += 1 elif random.random() > 0.5: if probability_matrix[i][j + k] == np.float32(0.15): inputs[i][j+k] = random.randint(5, len(self.tokenizer) - 1) covered.add((i, j + k)) cur_token += 1 else: if probability_matrix[i][j + k] == np.float32(0.15): covered.add((i, j + k)) cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens6(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 for j in range(len(probability_matrix[0])): if cur_token > total_token*0.15: break if probability_matrix[i][j] == np.float32(0.15): if random.random() > 0.85: if random.random() > 0.2: if random.random() > 0.9: for k in range(j, min(j + 4, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 elif random.random() > 0.222: for k in range(j, min(j + 3, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 elif random.random() > 0.42857: for k in range(j, min(j + 2, len(probability_matrix[0]))): if probability_matrix[i][k] == np.float32(0.15) and (i, k) not in covered: inputs[i][k] = self.tokenizer.mask_token_id covered.add((i, k)) cur_token += 1 else: inputs[i][j] = self.tokenizer.mask_token_id covered.add((i, j)) cur_token += 1 elif random.random() > 0.5: inputs[i][j] = random.randint(5, len(self.tokenizer) - 1) cur_token += 1 else: cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels def mask_tokens7(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." ) labels = inputs.clone() inputs = inputs.numpy() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, self.mlm_probability) special_tokens_mask = [ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) if self.tokenizer._pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) covered = set() ngrams = np.arange(1, 3 + 1, dtype=np.int64) pvals = 1. / np.arange(1, 3 + 1) pvals /= pvals.sum(keepdims=True) probability_matrix = probability_matrix.numpy() labels = labels.numpy() for i in range(len(probability_matrix)): cur_token = 0 total_token = 0 for j in range(len(probability_matrix[0])): if probability_matrix[i][j] == np.float32(0.15): total_token += 1 for j in range(len(probability_matrix[0])): if cur_token <= total_token * 0.15: n = np.random.choice(ngrams, p=pvals) if probability_matrix[i][j] == np.float32(0.15): for k in range(n): if j + k >= len(probability_matrix[0]): break if (i, j+k) in covered: continue if random.random() > 0.85: if random.random() > 0.2: if probability_matrix[i][j+k] == np.float32(0.15): inputs[i][j+k] = self.tokenizer.mask_token_id covered.add((i, j + k)) cur_token += 1 elif random.random() > 0.5: if probability_matrix[i][j + k] == np.float32(0.15): inputs[i][j+k] = random.randint(5, len(self.tokenizer) - 1) covered.add((i, j + k)) cur_token += 1 else: if probability_matrix[i][j + k] == np.float32(0.15): covered.add((i, j + k)) cur_token += 1 else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) else: labels[i][j] = np.float32(-100) inputs = torch.from_numpy(inputs) labels = torch.from_numpy(labels) return inputs, labels ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/datasets/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .glue import GlueDataset, GlueDataTrainingArguments from .language_modeling import LineByLineTextDataset, TextDataset ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/datasets/glue.py ================================================ import logging import os import time from dataclasses import dataclass, field from enum import Enum from typing import List, Optional, Union import torch from filelock import FileLock from torch.utils.data.dataset import Dataset from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_xlm_roberta import XLMRobertaTokenizer from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors from ..processors.utils import InputFeatures logger = logging.getLogger(__name__) @dataclass class GlueDataTrainingArguments: """ Arguments pertaining to what data we are going to input our model for training and eval. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())}) data_dir: str = field( metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."} ) max_seq_length: int = field( default=128, metadata={ "help": "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded." }, ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) def __post_init__(self): self.task_name = self.task_name.lower() class Split(Enum): train = "train" dev = "dev" test = "test" class GlueDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ args: GlueDataTrainingArguments output_mode: str features: List[InputFeatures] def __init__( self, args: GlueDataTrainingArguments, tokenizer: PreTrainedTokenizer, limit_length: Optional[int] = None, mode: Union[str, Split] = Split.train, ): self.args = args self.processor = glue_processors[args.task_name]() self.output_mode = glue_output_modes[args.task_name] if isinstance(mode, str): try: mode = Split[mode] except KeyError: raise KeyError("mode is not a valid split name") # Load data features from cache or dataset file cached_features_file = os.path.join( args.data_dir, "cached_{}_{}_{}_{}".format( mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name, ), ) label_list = self.processor.get_labels() if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, ): # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] self.label_list = label_list # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): if os.path.exists(cached_features_file) and not args.overwrite_cache: start = time.time() self.features = torch.load(cached_features_file) logger.info( f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start ) else: logger.info(f"Creating features from dataset file at {args.data_dir}") if mode == Split.dev: examples = self.processor.get_dev_examples(args.data_dir) elif mode == Split.test: examples = self.processor.get_test_examples(args.data_dir) else: examples = self.processor.get_train_examples(args.data_dir) if limit_length is not None: examples = examples[:limit_length] self.features = glue_convert_examples_to_features( examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=self.output_mode, ) start = time.time() torch.save(self.features, cached_features_file) # ^ This seems to take a lot of time so I want to investigate why and how we can improve. logger.info( "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start ) def __len__(self): return len(self.features) def __getitem__(self, i) -> InputFeatures: return self.features[i] def get_labels(self): return self.label_list ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/datasets/language_modeling.py ================================================ import logging import os import pickle import time import torch from filelock import FileLock from torch.utils.data.dataset import Dataset from ...tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) class TextDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ def __init__( self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, ): assert os.path.isfile(file_path) block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) directory, filename = os.path.split(file_path) cached_features_file = os.path.join( directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): if os.path.exists(cached_features_file) and not overwrite_cache: start = time.time() with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) logger.info( f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start ) else: logger.info(f"Creating features from dataset file at {directory}") self.examples = [] with open(file_path, encoding="utf-8") as f: text = f.read() tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size self.examples.append( tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]) ) # Note that we are losing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. start = time.time() with open(cached_features_file, "wb") as handle: pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) logger.info( "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start ) def __len__(self): return len(self.examples) def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) class LineByLineTextDataset(Dataset): """ This will be superseded by a framework-agnostic approach soon. """ def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int): assert os.path.isfile(file_path) # Here, we do not cache the features, operating under the assumption # that we will soon use fast multithreaded tokenizers from the # `tokenizers` repo everywhere =) logger.info("Creating features from dataset file at %s", file_path) with open(file_path, encoding="utf-8") as f: lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) self.examples = batch_encoding["input_ids"] def __len__(self): return len(self.examples) def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/metrics/__init__.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. try: from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score _has_sklearn = True except (AttributeError, ImportError): _has_sklearn = False def is_sklearn_available(): return _has_sklearn if _has_sklearn: def simple_accuracy(preds, labels): return (preds == labels).mean() def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { "acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2, } def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { "pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2, } def glue_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "cola": return {"mcc": matthews_corrcoef(labels, preds)} elif task_name == "sst-2": return {"acc": simple_accuracy(preds, labels)} elif task_name == "mrpc": return acc_and_f1(preds, labels) elif task_name == "sts-b": return pearson_and_spearman(preds, labels) elif task_name == "qqp": return acc_and_f1(preds, labels) elif task_name == "mnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "mnli-mm": return {"acc": simple_accuracy(preds, labels)} elif task_name == "qnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "rte": return {"acc": simple_accuracy(preds, labels)} elif task_name == "wnli": return {"acc": simple_accuracy(preds, labels)} elif task_name == "hans": return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) def xnli_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "xnli": return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/metrics/squad_metrics.py ================================================ """ Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted probability that a question is unanswerable. """ import collections import json import logging import math import re import string from transformers.tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" def remove_articles(text): regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) return re.sub(regex, " ", text) def white_space_fix(text): return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() return white_space_fix(remove_articles(remove_punc(lower(s)))) def get_tokens(s): if not s: return [] return normalize_answer(s).split() def compute_exact(a_gold, a_pred): return int(normalize_answer(a_gold) == normalize_answer(a_pred)) def compute_f1(a_gold, a_pred): gold_toks = get_tokens(a_gold) pred_toks = get_tokens(a_pred) common = collections.Counter(gold_toks) & collections.Counter(pred_toks) num_same = sum(common.values()) if len(gold_toks) == 0 or len(pred_toks) == 0: # If either is no-answer, then F1 is 1 if they agree, 0 otherwise return int(gold_toks == pred_toks) if num_same == 0: return 0 precision = 1.0 * num_same / len(pred_toks) recall = 1.0 * num_same / len(gold_toks) f1 = (2 * precision * recall) / (precision + recall) return f1 def get_raw_scores(examples, preds): """ Computes the exact and f1 scores from the examples and the model predictions """ exact_scores = {} f1_scores = {} for example in examples: qas_id = example.qas_id gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] if not gold_answers: # For unanswerable questions, only correct answer is empty string gold_answers = [""] if qas_id not in preds: print("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) return exact_scores, f1_scores def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): new_scores = {} for qid, s in scores.items(): pred_na = na_probs[qid] > na_prob_thresh if pred_na: new_scores[qid] = float(not qid_to_has_ans[qid]) else: new_scores[qid] = s return new_scores def make_eval_dict(exact_scores, f1_scores, qid_list=None): if not qid_list: total = len(exact_scores) return collections.OrderedDict( [ ("exact", 100.0 * sum(exact_scores.values()) / total), ("f1", 100.0 * sum(f1_scores.values()) / total), ("total", total), ] ) else: total = len(qid_list) return collections.OrderedDict( [ ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), ("total", total), ] ) def merge_eval(main_eval, new_eval, prefix): for k in new_eval: main_eval["%s_%s" % (prefix, k)] = new_eval[k] def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) for i, qid in enumerate(qid_list): if qid not in scores: continue if qid_to_has_ans[qid]: diff = scores[qid] else: if preds[qid]: diff = -1 else: diff = 0 cur_score += diff if cur_score > best_score: best_score = cur_score best_thresh = na_probs[qid] has_ans_score, has_ans_cnt = 0, 0 for qid in qid_list: if not qid_to_has_ans[qid]: continue has_ans_cnt += 1 if qid not in scores: continue has_ans_score += scores[qid] return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) main_eval["best_exact"] = best_exact main_eval["best_exact_thresh"] = exact_thresh main_eval["best_f1"] = best_f1 main_eval["best_f1_thresh"] = f1_thresh main_eval["has_ans_exact"] = has_ans_exact main_eval["has_ans_f1"] = has_ans_f1 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans best_score = cur_score best_thresh = 0.0 qid_list = sorted(na_probs, key=lambda k: na_probs[k]) for _, qid in enumerate(qid_list): if qid not in scores: continue if qid_to_has_ans[qid]: diff = scores[qid] else: if preds[qid]: diff = -1 else: diff = 0 cur_score += diff if cur_score > best_score: best_score = cur_score best_thresh = na_probs[qid] return 100.0 * best_score / len(scores), best_thresh def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) main_eval["best_exact"] = best_exact main_eval["best_exact_thresh"] = exact_thresh main_eval["best_f1"] = best_f1 main_eval["best_f1_thresh"] = f1_thresh def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] if no_answer_probs is None: no_answer_probs = {k: 0.0 for k in preds} exact, f1 = get_raw_scores(examples, preds) exact_threshold = apply_no_ans_threshold( exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) evaluation = make_eval_dict(exact_threshold, f1_threshold) if has_answer_qids: has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) merge_eval(evaluation, has_ans_eval, "HasAns") if no_answer_qids: no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) return evaluation def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So # now `orig_text` contains the span of our original text corresponding to the # span that we predicted. # # However, `orig_text` may contain extra characters that we don't want in # our prediction. # # For example, let's say: # pred_text = steve smith # orig_text = Steve Smith's # # We don't want to return `orig_text` because it contains the extra "'s". # # We don't want to return `pred_text` because it's already been normalized # (the SQuAD eval script also does punctuation stripping/lower casing but # our tokenizer does additional normalization like stripping accent # characters). # # What we really want to return is "Steve Smith". # # Therefore, we have to apply a semi-complicated alignment heuristic between # `pred_text` and `orig_text` to get a character-to-character alignment. This # can fail in certain cases in which case we just return `orig_text`. def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() for (i, c) in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i ns_chars.append(c) ns_text = "".join(ns_chars) return (ns_text, ns_to_s_map) # We first tokenize `orig_text`, strip whitespace from the result # and `pred_text`, and check if they are the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. tokenizer = BasicTokenizer(do_lower_case=do_lower_case) tok_text = " ".join(tokenizer.tokenize(orig_text)) start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} for (i, tok_index) in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None if start_position in tok_s_to_ns_map: ns_start_position = tok_s_to_ns_map[start_position] if ns_start_position in orig_ns_to_s_map: orig_start_position = orig_ns_to_s_map[ns_start_position] if orig_start_position is None: if verbose_logging: logger.info("Couldn't map start position") return orig_text orig_end_position = None if end_position in tok_s_to_ns_map: ns_end_position = tok_s_to_ns_map[end_position] if ns_end_position in orig_ns_to_s_map: orig_end_position = orig_ns_to_s_map[ns_end_position] if orig_end_position is None: if verbose_logging: logger.info("Couldn't map end position") return orig_text output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): if i >= n_best_size: break best_indexes.append(index_and_score[i][0]) return best_indexes def _compute_softmax(scores): """Compute softmax probability over raw logits.""" if not scores: return [] max_score = None for score in scores: if max_score is None or score > max_score: max_score = score exp_scores = [] total_sum = 0.0 for score in scores: x = math.exp(score - max_score) exp_scores.append(x) total_sum += x probs = [] for score in exp_scores: probs.append(score / total_sum) return probs def compute_predictions_logits( all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, version_2_with_negative, null_score_diff_threshold, tokenizer, ): """Write final predictions to the json file and log-odds of null if needed.""" if output_prediction_file: logger.info(f"Writing predictions to: {output_prediction_file}") if output_nbest_file: logger.info(f"Writing nbest to: {output_nbest_file}") if output_null_log_odds_file and version_2_with_negative: logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive min_null_feature_index = 0 # the paragraph slice with min null score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: feature_null_score = result.start_logits[0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index null_start_logit = result.start_logits[0] null_end_logit = result.end_logits[0] for start_index in start_indexes: for end_index in end_indexes: # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. if start_index >= len(feature.tokens): continue if end_index >= len(feature.tokens): continue if start_index not in feature.token_to_orig_map: continue if end_index not in feature.token_to_orig_map: continue if not feature.token_is_max_context.get(start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], end_logit=result.end_logits[end_index], ) ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( feature_index=min_null_feature_index, start_index=0, end_index=0, start_logit=null_start_logit, end_logit=null_end_logit, ) ) prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_logit", "end_logit"] ) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # tok_text = " ".join(tok_tokens) # # # De-tokenize WordPieces that have been split off. # tok_text = tok_text.replace(" ##", "") # tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True else: final_text = "" seen_predictions[final_text] = True nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_logit + entry.end_logit) if not best_non_null_entry: if entry.text: best_non_null_entry = entry probs = _compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_logit"] = entry.start_logit output["end_logit"] = entry.end_logit nbest_json.append(output) assert len(nbest_json) >= 1 if not version_2_with_negative: all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" else: all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json if output_prediction_file: with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") if output_nbest_file: with open(output_nbest_file, "w") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if output_null_log_odds_file and version_2_with_negative: with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") return all_predictions def compute_predictions_log_probs( all_examples, all_features, all_results, n_best_size, max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, start_n_top, end_n_top, version_2_with_negative, tokenizer, verbose_logging, ): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] cur_null_score = result.cls_logits # if we could have irrelevant answers, get the min score of irrelevant score_null = min(score_null, cur_null_score) for i in range(start_n_top): for j in range(end_n_top): start_log_prob = result.start_logits[i] start_index = result.start_top_index[i] j_index = i * end_n_top + j end_log_prob = result.end_logits[j_index] end_index = result.end_top_index[j_index] # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. if start_index >= feature.paragraph_len - 1: continue if end_index >= feature.paragraph_len - 1: continue if not feature.token_is_max_context.get(start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, end_log_prob=end_log_prob, ) ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True ) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] # end_orig_pos = tok_end_to_orig_index[pred.end_index] # paragraph_text = example.paragraph_text # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) if hasattr(tokenizer, "do_lower_case"): do_lower_case = tokenizer.do_lower_case else: do_lower_case = tokenizer.do_lowercase_and_remove_accent final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True nbest.append( _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_log_prob + entry.end_log_prob) if not best_non_null_entry: best_non_null_entry = entry probs = _compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_log_prob"] = entry.start_log_prob output["end_log_prob"] = entry.end_log_prob nbest_json.append(output) assert len(nbest_json) >= 1 assert best_non_null_entry is not None score_diff = score_null scores_diff_json[example.qas_id] = score_diff # note(zhiliny): always predict best_non_null_entry # and the evaluation script will search for the best threshold all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") with open(output_nbest_file, "w") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if version_2_with_negative: with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") return all_predictions ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/processors/__init__.py ================================================ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/processors/glue.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ GLUE processors and helpers """ import logging import os from enum import Enum from typing import List, Optional, Union from ...file_utils import is_tf_available from ...tokenization_utils import PreTrainedTokenizer from .utils import DataProcessor, InputExample, InputFeatures if is_tf_available(): import tensorflow as tf logger = logging.getLogger(__name__) def glue_convert_examples_to_features( examples: Union[List[InputExample], "tf.data.Dataset"], tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, ): """ Loads a data file into a list of ``InputFeatures`` Args: examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length. Defaults to the tokenizer's max_len task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ if is_tf_available() and isinstance(examples, tf.data.Dataset): if task is None: raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.") return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) return _glue_convert_examples_to_features( examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode ) if is_tf_available(): def _tf_glue_convert_examples_to_features( examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None, ) -> tf.data.Dataset: """ Returns: A ``tf.data.Dataset`` containing the task-specific features. """ processor = glue_processors[task]() examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples] features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) def gen(): for ex in features: yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, }, ex.label, ) return tf.data.Dataset.from_generator( gen, ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), }, tf.TensorShape([]), ), ) def _glue_convert_examples_to_features( examples: List[InputExample], tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, ): if max_length is None: max_length = tokenizer.max_len if task is not None: processor = glue_processors[task]() if label_list is None: label_list = processor.get_labels() logger.info("Using label list %s for task %s" % (label_list, task)) if output_mode is None: output_mode = glue_output_modes[task] logger.info("Using output mode %s for task %s" % (output_mode, task)) label_map = {label: i for i, label in enumerate(label_list)} def label_from_example(example: InputExample) -> Union[int, float, None]: if example.label is None: return None if output_mode == "classification": return label_map[example.label] elif output_mode == "regression": return float(example.label) raise KeyError(output_mode) labels = [label_from_example(example) for example in examples] batch_encoding = tokenizer.batch_encode_plus( [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, ) features = [] for i in range(len(examples)): inputs = {k: batch_encoding[k][i] for k in batch_encoding} feature = InputFeatures(**inputs, label=labels[i]) features.append(feature) for i, example in enumerate(examples[:5]): logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("features: %s" % features[i]) return features class OutputMode(Enum): classification = "classification" regression = "regression" class MrpcProcessor(DataProcessor): """Processor for the MRPC data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[3] text_b = line[4] label = None if set_type == "test" else line[0] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class MnliProcessor(DataProcessor): """Processor for the MultiNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["premise"].numpy().decode("utf-8"), tensor_dict["hypothesis"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched") def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[8] text_b = line[9] label = None if set_type.startswith("test") else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class MnliMismatchedProcessor(MnliProcessor): """Processor for the MultiNLI Mismatched data set (GLUE version).""" def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched") class ColaProcessor(DataProcessor): """Processor for the CoLA data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence"].numpy().decode("utf-8"), None, str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" test_mode = set_type == "test" if test_mode: lines = lines[1:] text_index = 1 if test_mode else 3 examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) text_a = line[text_index] label = None if test_mode else line[1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples class Sst2Processor(DataProcessor): """Processor for the SST-2 data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence"].numpy().decode("utf-8"), None, str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] text_index = 1 if set_type == "test" else 0 for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[text_index] label = None if set_type == "test" else line[1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples class StsbProcessor(DataProcessor): """Processor for the STS-B data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return [None] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[7] text_b = line[8] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class QqpProcessor(DataProcessor): """Processor for the QQP data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["question1"].numpy().decode("utf-8"), tensor_dict["question2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" test_mode = set_type == "test" q1_index = 1 if test_mode else 3 q2_index = 2 if test_mode else 4 examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) try: text_a = line[q1_index] text_b = line[q2_index] label = None if test_mode else line[5] except IndexError: continue examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class QnliProcessor(DataProcessor): """Processor for the QNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["question"].numpy().decode("utf-8"), tensor_dict["sentence"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["entailment", "not_entailment"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class RteProcessor(DataProcessor): """Processor for the RTE data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["entailment", "not_entailment"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class WnliProcessor(DataProcessor): """Processor for the WNLI data set (GLUE version).""" def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" return InputExample( tensor_dict["idx"].numpy(), tensor_dict["sentence1"].numpy().decode("utf-8"), tensor_dict["sentence2"].numpy().decode("utf-8"), str(tensor_dict["label"].numpy()), ) def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = None if set_type == "test" else line[-1] examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples glue_tasks_num_labels = { "cola": 2, "mnli": 3, "mrpc": 2, "sst-2": 2, "sts-b": 1, "qqp": 2, "qnli": 2, "rte": 2, "wnli": 2, } glue_processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } glue_output_modes = { "cola": "classification", "mnli": "classification", "mnli-mm": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/processors/squad.py ================================================ import json import logging import os from functools import partial from multiprocessing import Pool, cpu_count import numpy as np from tqdm import tqdm from ...file_utils import is_tf_available, is_torch_available from ...tokenization_bert import whitespace_tokenize from .utils import DataProcessor if is_torch_available(): import torch from torch.utils.data import TensorDataset if is_tf_available(): import tensorflow as tf logger = logging.getLogger(__name__) def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) return (input_start, input_end) def _check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue if position > end: continue num_left_context = position - doc_span.start num_right_context = end - position score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index return cur_span_index == best_span_index def _new_check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # if len(doc_spans) == 1: # return True best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): end = doc_span["start"] + doc_span["length"] - 1 if position < doc_span["start"]: continue if position > end: continue num_left_context = position - doc_span["start"] num_right_context = end - position score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] if best_score is None or score > best_score: best_score = score best_span_index = span_index return cur_span_index == best_span_index def _is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) ) non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) index = ( j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) p_mask = np.ones_like(span["token_type_ids"]) if tokenizer.padding_side == "right": p_mask[len(truncated_query) + sequence_added_tokens :] = 0 else: p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) special_token_indices = np.asarray( tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) ).nonzero() p_mask[pad_token_indices] = 1 p_mask[special_token_indices] = 1 # Set the cls index to 0: the CLS index can be used for impossible answers p_mask[cls_index] = 0 span_is_impossible = example.is_impossible start_position = 0 end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = cls_index end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset features.append( SquadFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], start_position=start_position, end_position=end_position, is_impossible=span_is_impossible, qas_id=example.qas_id, ) ) return features def squad_convert_example_to_features_init(tokenizer_for_convert): global tokenizer tokenizer = tokenizer_for_convert def squad_convert_examples_to_features( examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1, tqdm_enabled=True, ): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. Args: examples: list of :class:`~transformers1.data.processors.squad.SquadExample` tokenizer: an instance of a child of :class:`~transformers1.PreTrainedTokenizer` max_seq_length: The maximum sequence length of the inputs. doc_stride: The stride used when the context is too large and is split across several features. max_query_length: The maximum length of the query. is_training: whether to create features for model evaluation or model training. return_dataset: Default False. Either 'pt' or 'tf'. if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset threads: multiple processing threadsa-smi Returns: list of :class:`~transformers1.data.processors.squad.SquadFeatures` Example:: processor = SquadV2Processor() examples = processor.get_dev_examples(data_dir) features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, ) """ # Defining helper methods features = [] threads = min(threads, cpu_count()) with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: annotate_ = partial( squad_convert_example_to_features, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training, ) features = list( tqdm( p.imap(annotate_, examples, chunksize=32), total=len(examples), desc="convert squad examples to features", disable=not tqdm_enabled, ) ) new_features = [] unique_id = 1000000000 example_index = 0 for example_features in tqdm( features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled ): if not example_features: continue for example_feature in example_features: example_feature.example_index = example_index example_feature.unique_id = unique_id new_features.append(example_feature) unique_id += 1 example_index += 1 features = new_features del new_features if return_dataset == "pt": if not is_torch_available(): raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) if not is_training: all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_start_positions, all_end_positions, all_cls_index, all_p_mask, all_is_impossible, ) return features, dataset elif return_dataset == "tf": if not is_tf_available(): raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") def gen(): for i, ex in enumerate(features): yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, "feature_index": i, "qas_id": ex.qas_id, }, { "start_position": ex.start_position, "end_position": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, "is_impossible": ex.is_impossible, }, ) # Why have we split the batch into a tuple? PyTorch just has a list of tensors. train_types = ( { "input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32, "feature_index": tf.int64, "qas_id": tf.string, }, { "start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32, "is_impossible": tf.int32, }, ) train_shapes = ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), "feature_index": tf.TensorShape([]), "qas_id": tf.TensorShape([]), }, { "start_position": tf.TensorShape([]), "end_position": tf.TensorShape([]), "cls_index": tf.TensorShape([]), "p_mask": tf.TensorShape([None]), "is_impossible": tf.TensorShape([]), }, ) return tf.data.Dataset.from_generator(gen, train_types, train_shapes) else: return features class SquadProcessor(DataProcessor): """ Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. """ train_file = None dev_file = None def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") answer_start = tensor_dict["answers"]["answer_start"][0].numpy() answers = [] else: answers = [ {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) ] answer = None answer_start = None return SquadExample( qas_id=tensor_dict["id"].numpy().decode("utf-8"), question_text=tensor_dict["question"].numpy().decode("utf-8"), context_text=tensor_dict["context"].numpy().decode("utf-8"), answer_text=answer, start_position_character=answer_start, title=tensor_dict["title"].numpy().decode("utf-8"), answers=answers, ) def get_examples_from_dataset(self, dataset, evaluate=False): """ Creates a list of :class:`~transformers1.data.processors.squad.SquadExample` using a TFDS dataset. Args: dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` evaluate: boolean specifying if in evaluation mode or in training mode Returns: List of SquadExample Examples:: import tensorflow_datasets as tfds dataset = tfds.load("squad") training_examples = get_examples_from_dataset(dataset, evaluate=False) evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) """ if evaluate: dataset = dataset["validation"] else: dataset = dataset["train"] examples = [] for tensor_dict in tqdm(dataset): examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples def get_train_examples(self, data_dir, filename=None): """ Returns the training examples from the data directory. Args: data_dir: Directory containing the data files used for training and evaluating. filename: None by default, specify this if the training file has a different name than the original one which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ if data_dir is None: data_dir = "" if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open( os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") def get_dev_examples(self, data_dir, filename=None): """ Returns the evaluation example from the data directory. Args: data_dir: Directory containing the data files used for training and evaluating. filename: None by default, specify this if the evaluation file has a different name than the original one which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ if data_dir is None: data_dir = "" if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open( os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") def _create_examples(self, input_data, set_type): is_training = set_type == "train" examples = [] for entry in tqdm(input_data): title = entry["title"] for paragraph in entry["paragraphs"]: context_text = paragraph["context"] for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position_character = None answer_text = None answers = [] if "is_impossible" in qa: is_impossible = qa["is_impossible"] else: is_impossible = False if not is_impossible: if is_training: answer = qa["answers"][0] answer_text = answer["text"] start_position_character = answer["answer_start"] else: answers = qa["answers"] example = SquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, answer_text=answer_text, start_position_character=start_position_character, title=title, is_impossible=is_impossible, answers=answers, ) examples.append(example) return examples class SquadV1Processor(SquadProcessor): train_file = "train-v1.1.json" dev_file = "dev-v1.1.json" class SquadV2Processor(SquadProcessor): train_file = "train-v2.0.json" dev_file = "dev-v2.0.json" class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. Args: qas_id: The example's unique identifier question_text: The question string context_text: The context string answer_text: The answer string start_position_character: The character position of the start of the answer title: The title of the example answers: None by default, this is used during evaluation. Holds answers as well as their start positions. is_impossible: False by default, set to True if the example has no possible answer. """ def __init__( self, qas_id, question_text, context_text, answer_text, start_position_character, title, answers=[], is_impossible=False, ): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text self.title = title self.is_impossible = is_impossible self.answers = answers self.start_position, self.end_position = 0, 0 doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True # Split on whitespace so that different tokens may be attributed to their original position. for c in self.context_text: if _is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) self.doc_tokens = doc_tokens self.char_to_word_offset = char_to_word_offset # Start and end positions only has a value during evaluation. if start_position_character is not None and not is_impossible: self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[ min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] class SquadFeatures(object): """ Single squad example features to be fed to a model. Those features are model-specific and can be crafted from :class:`~transformers1.data.processors.squad.SquadExample` using the :method:`~transformers1.data.processors.squad.squad_convert_examples_to_features` method. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. token_type_ids: Segment token indices to indicate first and second portions of the inputs. cls_index: the index of the CLS token. p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer example_index: the index of the example unique_id: The unique Feature identifier paragraph_len: The length of the context token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. If a token does not have their maximum context in this feature object, it means that another feature object has more information related to that token and should be prioritized over this feature for that token. tokens: list of tokens corresponding to the input ids token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. start_position: start of the answer token index end_position: end of the answer token index """ def __init__( self, input_ids, attention_mask, token_type_ids, cls_index, p_mask, example_index, unique_id, paragraph_len, token_is_max_context, tokens, token_to_orig_map, start_position, end_position, is_impossible, qas_id: str = None, ): self.input_ids = input_ids self.attention_mask = attention_mask self.token_type_ids = token_type_ids self.cls_index = cls_index self.p_mask = p_mask self.example_index = example_index self.unique_id = unique_id self.paragraph_len = paragraph_len self.token_is_max_context = token_is_max_context self.tokens = tokens self.token_to_orig_map = token_to_orig_map self.start_position = start_position self.end_position = end_position self.is_impossible = is_impossible self.qas_id = qas_id class SquadResult(object): """ Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. Args: unique_id: The unique identifier corresponding to that example. start_logits: The logits corresponding to the start of the answer end_logits: The logits corresponding to the end of the answer """ def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): self.start_logits = start_logits self.end_logits = end_logits self.unique_id = unique_id if start_top_index: self.start_top_index = start_top_index self.end_top_index = end_top_index self.cls_logits = cls_logits ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/processors/utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import csv import dataclasses import json import logging from dataclasses import dataclass from typing import List, Optional, Union from ...file_utils import is_tf_available, is_torch_available logger = logging.getLogger(__name__) @dataclass class InputExample: """ A single training/test example for simple sequence classification. Args: guid: Unique id for the example. text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified. text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks. label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. """ guid: str text_a: str text_b: Optional[str] = None label: Optional[str] = None def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(dataclasses.asdict(self), indent=2) + "\n" @dataclass(frozen=True) class InputFeatures: """ A single set of features of data. Property names are the same names as the corresponding inputs to a model. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. token_type_ids: (Optional) Segment token indices to indicate first and second portions of the inputs. Only some models use them. label: (Optional) Label corresponding to the input. Int for classification problems, float for regression problems. """ input_ids: List[int] attention_mask: Optional[List[int]] = None token_type_ids: Optional[List[int]] = None label: Optional[Union[int, float]] = None def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(dataclasses.asdict(self)) + "\n" class DataProcessor: """Base class for data converters for sequence classification data sets.""" def get_example_from_tensor_dict(self, tensor_dict): """Gets an example from a dict with tensorflow tensors Args: tensor_dict: Keys and values should match the corresponding Glue tensorflow_dataset examples. """ raise NotImplementedError() def get_train_examples(self, data_dir): """Gets a collection of `InputExample`s for the train set.""" raise NotImplementedError() def get_dev_examples(self, data_dir): """Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() def get_test_examples(self, data_dir): """Gets a collection of `InputExample`s for the test set.""" raise NotImplementedError() def get_labels(self): """Gets the list of labels for this data set.""" raise NotImplementedError() def tfds_map(self, example): """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts examples to the correct format.""" if len(self.get_labels()) > 1: example.label = self.get_labels()[int(example.label)] return example @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" with open(input_file, "r", encoding="utf-8-sig") as f: return list(csv.reader(f, delimiter="\t", quotechar=quotechar)) class SingleSentenceClassificationProcessor(DataProcessor): """ Generic processor for a single sentence classification data set.""" def __init__(self, labels=None, examples=None, mode="classification", verbose=False): self.labels = [] if labels is None else labels self.examples = [] if examples is None else examples self.mode = mode self.verbose = verbose def __len__(self): return len(self.examples) def __getitem__(self, idx): if isinstance(idx, slice): return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx]) return self.examples[idx] @classmethod def create_from_csv( cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs ): processor = cls(**kwargs) processor.add_examples_from_csv( file_name, split_name=split_name, column_label=column_label, column_text=column_text, column_id=column_id, skip_first_row=skip_first_row, overwrite_labels=True, overwrite_examples=True, ) return processor @classmethod def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs): processor = cls(**kwargs) processor.add_examples(texts_or_text_and_labels, labels=labels) return processor def add_examples_from_csv( self, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, overwrite_labels=False, overwrite_examples=False, ): lines = self._read_tsv(file_name) if skip_first_row: lines = lines[1:] texts = [] labels = [] ids = [] for (i, line) in enumerate(lines): texts.append(line[column_text]) labels.append(line[column_label]) if column_id is not None: ids.append(line[column_id]) else: guid = "%s-%s" % (split_name, i) if split_name else "%s" % i ids.append(guid) return self.add_examples( texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples ) def add_examples( self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False ): assert labels is None or len(texts_or_text_and_labels) == len(labels) assert ids is None or len(texts_or_text_and_labels) == len(ids) if ids is None: ids = [None] * len(texts_or_text_and_labels) if labels is None: labels = [None] * len(texts_or_text_and_labels) examples = [] added_labels = set() for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids): if isinstance(text_or_text_and_label, (tuple, list)) and label is None: text, label = text_or_text_and_label else: text = text_or_text_and_label added_labels.add(label) examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label)) # Update examples if overwrite_examples: self.examples = examples else: self.examples.extend(examples) # Update labels if overwrite_labels: self.labels = list(added_labels) else: self.labels = list(set(self.labels).union(added_labels)) return self.examples def get_features( self, tokenizer, max_length=None, pad_on_left=False, pad_token=0, mask_padding_with_zero=True, return_tensors=None, ): """ Convert examples in a list of ``InputFeatures`` Args: tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ if max_length is None: max_length = tokenizer.max_len label_map = {label: i for i, label in enumerate(self.labels)} all_input_ids = [] for (ex_index, example) in enumerate(self.examples): if ex_index % 10000 == 0: logger.info("Tokenizing example %d", ex_index) input_ids = tokenizer.encode( example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len), ) all_input_ids.append(input_ids) batch_length = max(len(input_ids) for input_ids in all_input_ids) features = [] for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)): if ex_index % 10000 == 0: logger.info("Writing example %d/%d" % (ex_index, len(self.examples))) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. padding_length = batch_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) assert len(input_ids) == batch_length, "Error with input length {} vs {}".format( len(input_ids), batch_length ) assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format( len(attention_mask), batch_length ) if self.mode == "classification": label = label_map[example.label] elif self.mode == "regression": label = float(example.label) else: raise ValueError(self.mode) if ex_index < 5 and self.verbose: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("label: %s (id = %d)" % (example.label, label)) features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) if return_tensors is None: return features elif return_tensors == "tf": if not is_tf_available(): raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") import tensorflow as tf def gen(): for ex in features: yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label) dataset = tf.data.Dataset.from_generator( gen, ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])), ) return dataset elif return_tensors == "pt": if not is_torch_available(): raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) if self.mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif self.mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels) return dataset else: raise ValueError("return_tensors should be one of 'tf' or 'pt'") ================================================ FILE: code/bert-base-count5/pretrain/transformers1/data/processors/xnli.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ XNLI utils (dataset loading and evaluation) """ import logging import os from .utils import DataProcessor, InputExample logger = logging.getLogger(__name__) class XnliProcessor(DataProcessor): """Processor for the XNLI dataset. Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" def __init__(self, language, train_language=None): self.language = language self.train_language = train_language def get_train_examples(self, data_dir): """See base class.""" lg = self.language if self.train_language is None else self.train_language lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % ("train", i) text_a = line[0] text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_test_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue language = line[0] if language != self.language: continue guid = "%s-%s" % ("test", i) text_a = line[6] text_b = line[7] label = line[1] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] xnli_processors = { "xnli": XnliProcessor, } xnli_output_modes = { "xnli": "classification", } xnli_tasks_num_labels = { "xnli": 3, } ================================================ FILE: code/bert-base-count5/pretrain/transformers1/file.py ================================================ ================================================ FILE: code/bert-base-count5/pretrain/transformers1/file_utils.py ================================================ """ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ import fnmatch import json import logging import os import shutil import sys import tarfile import tempfile from contextlib import contextmanager from functools import partial, wraps from hashlib import sha256 from pathlib import Path from typing import Optional from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile import requests from filelock import FileLock from tqdm.auto import tqdm from . import __version__ logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: USE_TF = os.environ.get("USE_TF", "AUTO").upper() USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): import torch _torch_available = True # pylint: disable=invalid-name logger.info("PyTorch version {} available.".format(torch.__version__)) else: logger.info("Disabling PyTorch because USE_TF is set") _torch_available = False except ImportError: _torch_available = False # pylint: disable=invalid-name try: USE_TF = os.environ.get("USE_TF", "AUTO").upper() USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): import tensorflow as tf assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 _tf_available = True # pylint: disable=invalid-name logger.info("TensorFlow version {} available.".format(tf.__version__)) else: logger.info("Disabling Tensorflow because USE_TORCH is set") _tf_available = False except (ImportError, AssertionError): _tf_available = False # pylint: disable=invalid-name try: from torch.hub import _get_torch_home torch_cache_home = _get_torch_home() except ImportError: torch_cache_home = os.path.expanduser( os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) ) default_cache_path = os.path.join(torch_cache_home, "transformers1") PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) WEIGHTS_NAME = "pytorch_model.bin" TF2_WEIGHTS_NAME = "tf_model.h5" TF_WEIGHTS_NAME = "model.ckpt" CONFIG_NAME = "config.json" MODEL_CARD_NAME = "modelcard.json" MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co" def is_torch_available(): return _torch_available def is_tf_available(): return _tf_available def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator def add_start_docstrings_to_callable(*docstr): def docstring_decorator(fn): class_name = ":class:`~transformers1.{}`".format(fn.__qualname__.split(".")[0]) intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) note = r""" .. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post processing steps while the latter silently ignores them. """ fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") return fn return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = fn.__doc__ + "".join(docstr) return fn return docstring_decorator def is_remote_url(url_or_filename): parsed = urlparse(url_or_filename) return parsed.scheme in ("http", "https") def hf_bucket_url(model_id: str, filename: str, use_cdn=True) -> str: """ Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery Network, or CDN). Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes to the underlying file (default TTL is 24 hours). In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or the other (or switching from one to the other) will affect caching: cached files are not shared between the two because the cached file's name contains a hash of the url. """ endpoint = CLOUDFRONT_DISTRIB_PREFIX if use_cdn else S3_BUCKET_PREFIX legacy_format = "/" not in model_id if legacy_format: return f"{endpoint}/{model_id}-{filename}" else: return f"{endpoint}/{model_id}/{filename}" def url_to_filename(url, etag=None): """ Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can identify it as a HDF5 file (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() if etag: etag_bytes = etag.encode("utf-8") etag_hash = sha256(etag_bytes) filename += "." + etag_hash.hexdigest() if url.endswith(".h5"): filename += ".h5" return filename def filename_to_url(filename, cache_dir=None): """ Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): raise EnvironmentError("file {} not found".format(cache_path)) meta_path = cache_path + ".json" if not os.path.exists(meta_path): raise EnvironmentError("file {} not found".format(meta_path)) with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) url = metadata["url"] etag = metadata["etag"] return url, etag def cached_path( url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None, extract_compressed_file=False, force_extract=False, local_files_only=False, ) -> Optional[str]: """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path. Args: cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). force_download: if True, re-dowload the file even if it's already cached in the cache dir. resume_download: if True, resume the download if incompletly recieved file is found. user_agent: Optional string or dict that will be appended to the user-agent on remote requests. extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed file in a folder along the archive. force_extract: if True when extract_compressed_file is True and the archive was already extracted, re-extract the archive and overide the folder where it was extracted. Return: None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string) otherwise """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(url_or_filename, Path): url_or_filename = str(url_or_filename) if isinstance(cache_dir, Path): cache_dir = str(cache_dir) if is_remote_url(url_or_filename): # URL, so get it from the cache (downloading if necessary) output_path = get_from_cache( url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, user_agent=user_agent, local_files_only=local_files_only, ) elif os.path.exists(url_or_filename): # File, and it exists. output_path = url_or_filename elif urlparse(url_or_filename).scheme == "": # File, but it doesn't exist. raise EnvironmentError("file {} not found".format(url_or_filename)) else: # Something unknown raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) if extract_compressed_file: if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): return output_path # Path where we extract compressed archives # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" output_dir, output_file = os.path.split(output_path) output_extract_dir_name = output_file.replace(".", "-") + "-extracted" output_path_extracted = os.path.join(output_dir, output_extract_dir_name) if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: return output_path_extracted # Prevent parallel extractions lock_path = output_path + ".lock" with FileLock(lock_path): shutil.rmtree(output_path_extracted, ignore_errors=True) os.makedirs(output_path_extracted) if is_zipfile(output_path): with ZipFile(output_path, "r") as zip_file: zip_file.extractall(output_path_extracted) zip_file.close() elif tarfile.is_tarfile(output_path): tar_file = tarfile.open(output_path) tar_file.extractall(output_path_extracted) tar_file.close() else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) return output_path_extracted return output_path def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): ua = "transformers1/{}; python/{}".format(__version__, sys.version.split()[0]) if is_torch_available(): ua += "; torch/{}".format(torch.__version__) if is_tf_available(): ua += "; tensorflow/{}".format(tf.__version__) if isinstance(user_agent, dict): ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) elif isinstance(user_agent, str): ua += "; " + user_agent headers = {"user-agent": ua} if resume_size > 0: headers["Range"] = "bytes=%d-" % (resume_size,) response = requests.get(url, stream=True, proxies=proxies, headers=headers) if response.status_code == 416: # Range not satisfiable return content_length = response.headers.get("Content-Length") total = resume_size + int(content_length) if content_length is not None else None progress = tqdm( unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading", disable=bool(logger.getEffectiveLevel() == logging.NOTSET), ) for chunk in response.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close() def get_from_cache( url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None, local_files_only=False, ) -> Optional[str]: """ Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the path to the cached file. Return: None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string) otherwise """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) os.makedirs(cache_dir, exist_ok=True) etag = None if not local_files_only: try: response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) if response.status_code == 200: etag = response.headers.get("ETag") except (EnvironmentError, requests.exceptions.Timeout): # etag is already None pass filename = url_to_filename(url, etag) # get cache path to put the file cache_path = os.path.join(cache_dir, filename) # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. # try to get the last downloaded one if etag is None: if os.path.exists(cache_path): return cache_path else: matching_files = [ file for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") if not file.endswith(".json") and not file.endswith(".lock") ] if len(matching_files) > 0: return os.path.join(cache_dir, matching_files[-1]) else: # If files cannot be found and local_files_only=True, # the models might've been found if local_files_only=False # Notify the user about that if local_files_only: raise ValueError( "Cannot find the requested files in the cached path and outgoing traffic has been" " disabled. To enable model look-ups and downloads online, set 'local_files_only'" " to False." ) return None # From now on, etag is not None. if os.path.exists(cache_path) and not force_download: return cache_path # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. return cache_path if resume_download: incomplete_path = cache_path + ".incomplete" @contextmanager def _resumable_file_manager(): with open(incomplete_path, "a+b") as f: yield f temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size else: resume_size = 0 else: temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with temp_file_manager() as temp_file: logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) logger.info("storing %s in cache at %s", url, cache_path) os.replace(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) meta = {"url": url, "etag": etag} meta_path = cache_path + ".json" with open(meta_path, "w") as meta_file: json.dump(meta, meta_file) return cache_path class cached_property(property): """ Descriptor that mimics @property but caches output in member variable. From tensorflow_datasets Built-in in functools from Python 3.8. """ def __get__(self, obj, objtype=None): # See docs.python.org/3/howto/descriptor.html#properties if obj is None: return self if self.fget is None: raise AttributeError("unreadable attribute") attr = "__cached_" + self.fget.__name__ cached = getattr(obj, attr, None) if cached is None: cached = self.fget(obj) setattr(obj, attr, cached) return cached def torch_required(func): # Chose a different decorator name than in tests so it's clear they are not the same. @wraps(func) def wrapper(*args, **kwargs): if is_torch_available(): return func(*args, **kwargs) else: raise ImportError(f"Method `{func.__name__}` requires PyTorch.") return wrapper def tf_required(func): # Chose a different decorator name than in tests so it's clear they are not the same. @wraps(func) def wrapper(*args, **kwargs): if is_tf_available(): return func(*args, **kwargs) else: raise ImportError(f"Method `{func.__name__}` requires TF.") return wrapper ================================================ FILE: code/bert-base-count5/pretrain/transformers1/filep.py ================================================ from transformers import GPT2LMHeadModel, GPT2Tokenizer import torch tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPT2LMHeadModel.from_pretrained('gpt2') generated = tokenizer.encode("The Manhattan bridge") context = torch.tensor([generated]) past = None for i in range(15): output, past = model(context, past=past) distribution = output[0, :] # Get the top 10 values' indices and cast them to a list top_values = distribution[-1].topk(10).indices.tolist() # Decode those into words top_words = [tokenizer.decode([x]) for x in top_values.indices.tolist()] # select words (only arbitrarily select the first three) words = words[0:3] # Cast them back to tokens which can be used as an added token selected_tokens = [tokenizer.encode(word) for word in words] generated += [argmax_token.tolist()] context = argmax_token.unsqueeze(0) print(tokenizer.decode([argmax_token.tolist()])) sequence = tokenizer.decode(generated) print(sequence) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/hf_api.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import io import os from os.path import expanduser from typing import Dict, List, Optional, Tuple import requests from tqdm import tqdm ENDPOINT = "https://huggingface.co" class S3Obj: """ Data structure that represents a file belonging to the current user. """ def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs): self.filename = filename self.LastModified = LastModified self.ETag = ETag self.Size = Size class PresignedUrl: def __init__(self, write: str, access: str, type: str, **kwargs): self.write = write self.access = access self.type = type # mime-type to send to S3. class S3Object: """ Data structure that represents a public file accessible on our S3. """ def __init__( self, key: str, # S3 object key etag: str, lastModified: str, size: int, rfilename: str, # filename relative to config.json **kwargs ): self.key = key self.etag = etag self.lastModified = lastModified self.size = size self.rfilename = rfilename class ModelInfo: """ Info about a public model accessible from our S3. """ def __init__( self, modelId: str, # id of model key: str, # S3 object key of config.json author: Optional[str] = None, downloads: Optional[int] = None, tags: List[str] = [], siblings: List[Dict] = [], # list of files that constitute the model **kwargs ): self.modelId = modelId self.key = key self.author = author self.downloads = downloads self.tags = tags self.siblings = [S3Object(**x) for x in siblings] class HfApi: def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else ENDPOINT def login(self, username: str, password: str) -> str: """ Call HF API to sign in a user and get a token if credentials are valid. Outputs: token if credentials are valid Throws: requests.exceptions.HTTPError if credentials are invalid """ path = "{}/api/login".format(self.endpoint) r = requests.post(path, json={"username": username, "password": password}) r.raise_for_status() d = r.json() return d["token"] def whoami(self, token: str) -> Tuple[str, List[str]]: """ Call HF API to know "whoami" """ path = "{}/api/whoami".format(self.endpoint) r = requests.get(path, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() d = r.json() return d["user"], d["orgs"] def logout(self, token: str) -> None: """ Call HF API to log out. """ path = "{}/api/logout".format(self.endpoint) r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() def presign(self, token: str, filename: str, organization: Optional[str] = None) -> PresignedUrl: """ Call HF API to get a presigned url to upload `filename` to S3. """ path = "{}/api/presign".format(self.endpoint) r = requests.post( path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename, "organization": organization}, ) r.raise_for_status() d = r.json() return PresignedUrl(**d) def presign_and_upload(self, token: str, filename: str, filepath: str, organization: Optional[str] = None) -> str: """ Get a presigned url, then upload file to S3. Outputs: url: Read-only url for the stored file on S3. """ urls = self.presign(token, filename=filename, organization=organization) # streaming upload: # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads # # Even though we presign with the correct content-type, # the client still has to specify it when uploading the file. with open(filepath, "rb") as f: pf = TqdmProgressFileReader(f) data = f if pf.total_size > 0 else "" r = requests.put(urls.write, data=data, headers={"content-type": urls.type}) r.raise_for_status() pf.close() return urls.access def list_objs(self, token: str, organization: Optional[str] = None) -> List[S3Obj]: """ Call HF API to list all stored files for user (or one of their organizations). """ path = "{}/api/listObjs".format(self.endpoint) params = {"organization": organization} if organization is not None else None r = requests.get(path, params=params, headers={"authorization": "Bearer {}".format(token)}) r.raise_for_status() d = r.json() return [S3Obj(**x) for x in d] def delete_obj(self, token: str, filename: str, organization: Optional[str] = None): """ Call HF API to delete a file stored by user """ path = "{}/api/deleteObj".format(self.endpoint) r = requests.delete( path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename, "organization": organization}, ) r.raise_for_status() def model_list(self) -> List[ModelInfo]: """ Get the public list of all the models on huggingface, including the community models """ path = "{}/api/models".format(self.endpoint) r = requests.get(path) r.raise_for_status() d = r.json() return [ModelInfo(**x) for x in d] class TqdmProgressFileReader: """ Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) and override `f.read()` so as to display a tqdm progress bar. see github.com/huggingface/transformers1/pull/2078#discussion_r354739608 for implementation details. """ def __init__(self, f: io.BufferedReader): self.f = f self.total_size = os.fstat(f.fileno()).st_size self.pbar = tqdm(total=self.total_size, leave=False) self.read = f.read f.read = self._read def _read(self, n=-1): self.pbar.update(n) return self.read(n) def close(self): self.pbar.close() class HfFolder: path_token = expanduser("~/.huggingface/token") @classmethod def save_token(cls, token): """ Save token, creating folder as needed. """ os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) with open(cls.path_token, "w+") as f: f.write(token) @classmethod def get_token(cls): """ Get token or None if not existent. """ try: with open(cls.path_token, "r") as f: return f.read() except FileNotFoundError: pass @classmethod def delete_token(cls): """ Delete token. Do not fail if token does not exist. """ try: os.remove(cls.path_token) except FileNotFoundError: pass ================================================ FILE: code/bert-base-count5/pretrain/transformers1/hf_argparser.py ================================================ import dataclasses import json import sys from argparse import ArgumentParser from enum import Enum from pathlib import Path from typing import Any, Iterable, List, NewType, Tuple, Union DataClass = NewType("DataClass", Any) DataClassType = NewType("DataClassType", Any) class HfArgumentParser(ArgumentParser): """ This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments. The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed) arguments to the parser after initialization and you'll get the output back after parsing as an additional namespace. """ dataclass_types: Iterable[DataClassType] def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType]], **kwargs): """ Args: dataclass_types: Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. kwargs: (Optional) Passed to `argparse.ArgumentParser()` in the regular way. """ super().__init__(**kwargs) if dataclasses.is_dataclass(dataclass_types): dataclass_types = [dataclass_types] self.dataclass_types = dataclass_types for dtype in self.dataclass_types: self._add_dataclass_arguments(dtype) def _add_dataclass_arguments(self, dtype: DataClassType): for field in dataclasses.fields(dtype): field_name = f"--{field.name}" kwargs = field.metadata.copy() # field.metadata is not used at all by Data Classes, # it is provided as a third-party extension mechanism. if isinstance(field.type, str): raise ImportError( "This implementation is not compatible with Postponed Evaluation of Annotations (PEP 563)," "which can be opted in from Python 3.7 with `from __future__ import annotations`." "We will add compatibility when Python 3.9 is released." ) typestring = str(field.type) for prim_type in (int, float, str): for collection in (List,): if typestring == f"typing.Union[{collection[prim_type]}, NoneType]": field.type = collection[prim_type] if typestring == f"typing.Union[{prim_type.__name__}, NoneType]": field.type = prim_type if isinstance(field.type, type) and issubclass(field.type, Enum): kwargs["choices"] = list(field.type) kwargs["type"] = field.type if field.default is not dataclasses.MISSING: kwargs["default"] = field.default elif field.type is bool: kwargs["action"] = "store_false" if field.default is True else "store_true" if field.default is True: field_name = f"--no-{field.name}" kwargs["dest"] = field.name elif hasattr(field.type, "__origin__") and issubclass(field.type.__origin__, List): kwargs["nargs"] = "+" kwargs["type"] = field.type.__args__[0] assert all( x == kwargs["type"] for x in field.type.__args__ ), "{} cannot be a List of mixed types".format(field.name) if field.default_factory is not dataclasses.MISSING: kwargs["default"] = field.default_factory() else: kwargs["type"] = field.type if field.default is not dataclasses.MISSING: kwargs["default"] = field.default else: kwargs["required"] = True self.add_argument(field_name, **kwargs) def parse_args_into_dataclasses( self, args=None, return_remaining_strings=False, look_for_args_file=True ) -> Tuple[DataClass, ...]: """ Parse command-line args into instances of the specified dataclass types. This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args Args: args: List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) return_remaining_strings: If true, also return a list of remaining argument strings. look_for_args_file: If true, will look for a ".args" file with the same base name as the entry point script for this process, and will append its potential content to the command line args. Returns: Tuple consisting of: - the dataclass instances in the same order as they were passed to the initializer.abspath - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser after initialization. - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) """ if look_for_args_file and len(sys.argv): args_file = Path(sys.argv[0]).with_suffix(".args") if args_file.exists(): fargs = args_file.read_text().split() args = fargs + args if args is not None else fargs + sys.argv[1:] # in case of duplicate arguments the first one has precedence # so we append rather than prepend. namespace, remaining_args = self.parse_known_args(args=args) outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype)} inputs = {k: v for k, v in vars(namespace).items() if k in keys} for k in keys: delattr(namespace, k) obj = dtype(**inputs) outputs.append(obj) if len(namespace.__dict__) > 0: # additional namespace. outputs.append(namespace) if return_remaining_strings: return (*outputs, remaining_args) else: if remaining_args: raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}") return (*outputs,) def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. """ data = json.loads(Path(json_file).read_text()) outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype)} inputs = {k: v for k, v in data.items() if k in keys} obj = dtype(**inputs) outputs.append(obj) return (*outputs,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modelcard.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Configuration base class and utilities.""" import copy import json import logging import os from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP from .file_utils import ( CONFIG_NAME, MODEL_CARD_NAME, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url, ) logger = logging.getLogger(__name__) class ModelCard: r""" Structured Model Card class. Store model card as well as methods for loading/downloading/saving model cards. Please read the following paper for details and explanation on the sections: "Model Cards for Model Reporting" by Margaret Mitchell, Simone Wu, Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer, Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards. Link: https://arxiv.org/abs/1810.03993 Note: A model card can be loaded and saved to disk. Parameters: """ def __init__(self, **kwargs): # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers) self.model_details = kwargs.pop("model_details", {}) self.intended_use = kwargs.pop("intended_use", {}) self.factors = kwargs.pop("factors", {}) self.metrics = kwargs.pop("metrics", {}) self.evaluation_data = kwargs.pop("evaluation_data", {}) self.training_data = kwargs.pop("training_data", {}) self.quantitative_analyses = kwargs.pop("quantitative_analyses", {}) self.ethical_considerations = kwargs.pop("ethical_considerations", {}) self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {}) # Open additional attributes for key, value in kwargs.items(): try: setattr(self, key, value) except AttributeError as err: logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err def save_pretrained(self, save_directory_or_file): """ Save a model card object to the directory or file `save_directory_or_file`. """ if os.path.isdir(save_directory_or_file): # If we save using the predefined names, we can load using `from_pretrained` output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME) else: output_model_card_file = save_directory_or_file self.to_json_file(output_model_card_file) logger.info("Model card saved in {}".format(output_model_card_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiate a :class:`~transformers1.ModelCard` from a pre-trained model model card. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a model card file saved using the :func:`~transformers1.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache should not be used. kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading. - The values in kwargs of any keys which are model card attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. find_from_standard_name: (`optional`) boolean, default True: If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename. Can be used to directly feed a model/config url and access the colocated modelcard. return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final model card object. - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored. Examples:: modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')` modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json') modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) """ cache_dir = kwargs.pop("cache_dir", None) proxies = kwargs.pop("proxies", None) find_from_standard_name = kwargs.pop("find_from_standard_name", True) return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: # For simplicity we use the same pretrained url than the configuration files # but with a different suffix (modelcard.json). This suffix is replaced below. model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] elif os.path.isdir(pretrained_model_name_or_path): model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): model_card_file = pretrained_model_name_or_path else: model_card_file = hf_bucket_url(pretrained_model_name_or_path, filename=MODEL_CARD_NAME, use_cdn=False) if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME) model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME) try: # Load from URL or cache if already cached resolved_model_card_file = cached_path( model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False ) if resolved_model_card_file is None: raise EnvironmentError if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: logger.info( "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file) ) # Load model card modelcard = cls.from_json_file(resolved_model_card_file) except (EnvironmentError, json.JSONDecodeError): # We fall back on creating an empty model card modelcard = cls() # Update model card with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(modelcard, key): setattr(modelcard, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model card: %s", str(modelcard)) if return_unused_kwargs: return modelcard, kwargs else: return modelcard @classmethod def from_dict(cls, json_object): """Constructs a `ModelCard` from a Python dictionary of parameters.""" return cls(**json_object) @classmethod def from_json_file(cls, json_file): """Constructs a `ModelCard` from a json file of parameters.""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) def __eq__(self, other): return self.__dict__ == other.__dict__ def __repr__(self): return str(self.to_json_string()) def to_dict(self): """Serializes this instance to a Python dictionary.""" output = copy.deepcopy(self.__dict__) return output def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path): """ Save this instance to a json file.""" with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_albert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch ALBERT model. """ import logging import math import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_albert import AlbertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "albert-base-v1", "albert-large-v1", "albert-xlarge-v1", "albert-xxlarge-v1", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", # See all ALBERT models at https://huggingface.co/models?filter=albert ] def load_tf_weights_in_albert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): print(name) for name, array in zip(names, arrays): original_name = name # If saved from the TF HUB module name = name.replace("module/", "") # Renaming and simplifying name = name.replace("ffn_1", "ffn") name = name.replace("bert/", "albert/") name = name.replace("attention_1", "attention") name = name.replace("transform/", "") name = name.replace("LayerNorm_1", "full_layer_layer_norm") name = name.replace("LayerNorm", "attention/LayerNorm") name = name.replace("transformer/", "") # The feed forward layer had an 'intermediate' step which has been abstracted away name = name.replace("intermediate/dense/", "") name = name.replace("ffn/intermediate/output/dense/", "ffn_output/") # ALBERT attention was split between self and output which have been abstracted away name = name.replace("/output/", "/") name = name.replace("/self/", "/") # The pooler is a linear layer name = name.replace("pooler/dense", "pooler") # The classifier was simplified to predictions from cls/predictions name = name.replace("cls/predictions", "predictions") name = name.replace("predictions/attention", "predictions") # Naming was changed to be more explicit name = name.replace("embeddings/attention", "embeddings") name = name.replace("inner_group_", "albert_layers/") name = name.replace("group_", "albert_layer_groups/") # Classifier if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): name = "classifier/" + name # No ALBERT model currently handles the next sentence prediction task if "seq_relationship" in name: name = name.replace("seq_relationship/output_", "sop_classifier/classifier/") name = name.replace("weights", "weight") name = name.split("/") # Ignore the gradients applied by the LAMB/ADAM optimizers. if ( "adam_m" in name or "adam_v" in name or "AdamWeightDecayOptimizer" in name or "AdamWeightDecayOptimizer_1" in name or "global_step" in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise print("Initialize PyTorch weight {} from {}".format(name, original_name)) pointer.data = torch.from_numpy(array) return model class AlbertEmbeddings(BertEmbeddings): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__(config) self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) class AlbertAttention(BertSelfAttention): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.hidden_size = config.hidden_size self.attention_head_size = config.hidden_size // config.num_attention_heads self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.num_attention_heads, self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.query = prune_linear_layer(self.query, index) self.key = prune_linear_layer(self.key, index) self.value = prune_linear_layer(self.value, index) self.dense = prune_linear_layer(self.dense, index, dim=1) # Update hyper params and store pruned heads self.num_attention_heads = self.num_attention_heads - len(heads) self.all_head_size = self.attention_head_size * self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input_ids, attention_mask=None, head_mask=None): mixed_query_layer = self.query(input_ids) mixed_key_layer = self.key(input_ids) mixed_value_layer = self.value(input_ids) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # Should find a better way to do this w = ( self.dense.weight.t() .view(self.num_attention_heads, self.attention_head_size, self.hidden_size) .to(context_layer.dtype) ) b = self.dense.bias.to(context_layer.dtype) projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b projected_context_layer_dropout = self.dropout(projected_context_layer) layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout) return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,) class AlbertLayer(nn.Module): def __init__(self, config): super().__init__() self.config = config self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = AlbertAttention(config) self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) self.activation = ACT2FN[config.hidden_act] def forward(self, hidden_states, attention_mask=None, head_mask=None): attention_output = self.attention(hidden_states, attention_mask, head_mask) ffn_output = self.ffn(attention_output[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) return (hidden_states,) + attention_output[1:] # add attentions if we output them class AlbertLayerGroup(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) def forward(self, hidden_states, attention_mask=None, head_mask=None): layer_hidden_states = () layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index]) hidden_states = layer_output[0] if self.output_attentions: layer_attentions = layer_attentions + (layer_output[1],) if self.output_hidden_states: layer_hidden_states = layer_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (layer_hidden_states,) if self.output_attentions: outputs = outputs + (layer_attentions,) return outputs # last-layer hidden state, (layer hidden states), (layer attentions) class AlbertTransformer(nn.Module): def __init__(self, config): super().__init__() self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) def forward(self, hidden_states, attention_mask=None, head_mask=None): hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_attentions = () if self.output_hidden_states: all_hidden_states = (hidden_states,) for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( hidden_states, attention_mask, head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], ) hidden_states = layer_group_output[0] if self.output_attentions: all_attentions = all_attentions + layer_group_output[-1] if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class AlbertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = AlbertConfig base_model_prefix = "albert" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) ALBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Args: config (:class:`~transformers1.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ALBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.AlbertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", ALBERT_START_DOCSTRING, ) class AlbertModel(AlbertPreTrainedModel): config_class = AlbertConfig load_tf_weights = load_tf_weights_in_albert base_model_prefix = "albert" def __init__(self, config): super().__init__(config) self.config = config self.embeddings = AlbertEmbeddings(config) self.encoder = AlbertTransformer(config) self.pooler = nn.Linear(config.hidden_size, config.hidden_size) self.pooler_activation = nn.Tanh() self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.embeddings.word_embeddings new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) self.embeddings.word_embeddings = new_embeddings return self.embeddings.word_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers. These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer, while [2,3] correspond to the two inner groups of the second hidden layer. Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more information about head pruning """ for layer, heads in heads_to_prune.items(): group_idx = int(layer / self.config.inner_group_num) inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Example:: from transformers1 import AlbertModel, AlbertTokenizer import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertModel.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) outputs = (sequence_output, pooled_output) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs @add_start_docstrings( """Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `sentence order prediction (classification)` head. """, ALBERT_START_DOCSTRING, ) class AlbertForPreTraining(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.albert = AlbertModel(config) self.predictions = AlbertMLMHead(config) self.sop_classifier = AlbertSOPHead(config) self.init_weights() self.tie_weights() def tie_weights(self): self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, sentence_order_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForPreTraining import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForPreTraining.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, sop_scores = outputs[:2] """ outputs = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output) outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here if masked_lm_labels is not None and sentence_order_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, sop_scores, (hidden_states), (attentions) class AlbertMLMHead(nn.Module): def __init__(self, config): super().__init__() self.LayerNorm = nn.LayerNorm(config.embedding_size) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) self.dense = nn.Linear(config.hidden_size, config.embedding_size) self.decoder = nn.Linear(config.embedding_size, config.vocab_size) self.activation = ACT2FN[config.hidden_act] # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.LayerNorm(hidden_states) hidden_states = self.decoder(hidden_states) prediction_scores = hidden_states return prediction_scores class AlbertSOPHead(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) def forward(self, pooled_output): dropout_pooled_output = self.dropout(pooled_output) logits = self.classifier(dropout_pooled_output) return logits @add_start_docstrings( "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.albert = AlbertModel(config) self.predictions = AlbertMLMHead(config) self.init_weights() self.tie_weights() def tie_weights(self): self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Example:: from transformers1 import AlbertTokenizer, AlbertForMaskedLM import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_outputs = outputs[0] prediction_scores = self.predictions(sequence_outputs) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs @add_start_docstrings( """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ALBERT_START_DOCSTRING, ) class AlbertForSequenceClassification(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.dropout = nn.Dropout(config.classifier_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForSequenceClassification import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import AlbertTokenizer, AlbertForTokenClassification import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForTokenClassification.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels)[active_loss] active_labels = labels.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ALBERT_START_DOCSTRING, ) class AlbertForQuestionAnswering(AlbertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` Span-start scores (before SoftMax). end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers1 import AlbertTokenizer, AlbertForQuestionAnswering import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_dict = tokenizer.encode_plus(question, text, return_tensors='pt') start_scores, end_scores = model(**input_dict) """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Model class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BartConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, EncoderDecoderConfig, FlaubertConfig, GPT2Config, LongformerConfig, OpenAIGPTConfig, ReformerConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, ) from .configuration_marian import MarianConfig from .configuration_utils import PretrainedConfig from .modeling_albert import ( AlbertForMaskedLM, AlbertForPreTraining, AlbertForQuestionAnswering, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertModel, ) from .modeling_bart import BartForConditionalGeneration, BartForSequenceClassification, BartModel from .modeling_bert import ( BertForMaskedLM, BertForMultipleChoice, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, BertModel, ) from .modeling_camembert import ( CamembertForMaskedLM, CamembertForMultipleChoice, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, ) from .modeling_ctrl import CTRLLMHeadModel, CTRLModel from .modeling_distilbert import ( DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, ) from .modeling_electra import ( ElectraForMaskedLM, ElectraForPreTraining, ElectraForSequenceClassification, ElectraForTokenClassification, ElectraModel, ) from .modeling_encoder_decoder import EncoderDecoderModel from .modeling_flaubert import ( FlaubertForQuestionAnsweringSimple, FlaubertForSequenceClassification, FlaubertModel, FlaubertWithLMHeadModel, ) from .modeling_gpt2 import GPT2LMHeadModel, GPT2Model from .modeling_longformer import ( LongformerForMaskedLM, LongformerForMultipleChoice, LongformerForQuestionAnswering, LongformerForSequenceClassification, LongformerForTokenClassification, LongformerModel, ) from .modeling_marian import MarianMTModel from .modeling_openai import OpenAIGPTLMHeadModel, OpenAIGPTModel from .modeling_reformer import ReformerModel, ReformerModelWithLMHead from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) from .modeling_t5 import T5ForConditionalGeneration, T5Model from .modeling_transfo_xl import TransfoXLLMHeadModel, TransfoXLModel from .modeling_xlm import ( XLMForQuestionAnsweringSimple, XLMForSequenceClassification, XLMForTokenClassification, XLMModel, XLMWithLMHeadModel, ) from .modeling_xlm_roberta import ( XLMRobertaForMaskedLM, XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, ) from .modeling_xlnet import ( XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, XLNetForSequenceClassification, XLNetForTokenClassification, XLNetLMHeadModel, XLNetModel, ) logger = logging.getLogger(__name__) MODEL_MAPPING = OrderedDict( [ (T5Config, T5Model), (DistilBertConfig, DistilBertModel), (AlbertConfig, AlbertModel), (CamembertConfig, CamembertModel), (XLMRobertaConfig, XLMRobertaModel), (BartConfig, BartModel), (LongformerConfig, LongformerModel), (RobertaConfig, RobertaModel), (BertConfig, BertModel), (OpenAIGPTConfig, OpenAIGPTModel), (GPT2Config, GPT2Model), (TransfoXLConfig, TransfoXLModel), (XLNetConfig, XLNetModel), (FlaubertConfig, FlaubertModel), (XLMConfig, XLMModel), (CTRLConfig, CTRLModel), (ElectraConfig, ElectraModel), (ReformerConfig, ReformerModel), ] ) MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForPreTraining), (CamembertConfig, CamembertForMaskedLM), (XLMRobertaConfig, XLMRobertaForMaskedLM), (BartConfig, BartForConditionalGeneration), (LongformerConfig, LongformerForMaskedLM), (RobertaConfig, RobertaForMaskedLM), (BertConfig, BertForPreTraining), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), (ElectraConfig, ElectraForPreTraining), ] ) MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForMaskedLM), (CamembertConfig, CamembertForMaskedLM), (XLMRobertaConfig, XLMRobertaForMaskedLM), (MarianConfig, MarianMTModel), (BartConfig, BartForConditionalGeneration), (LongformerConfig, LongformerForMaskedLM), (RobertaConfig, RobertaForMaskedLM), (BertConfig, BertForMaskedLM), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), (ElectraConfig, ElectraForMaskedLM), (EncoderDecoderConfig, EncoderDecoderModel), (ReformerConfig, ReformerModelWithLMHead), ] ) MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForSequenceClassification), (AlbertConfig, AlbertForSequenceClassification), (CamembertConfig, CamembertForSequenceClassification), (XLMRobertaConfig, XLMRobertaForSequenceClassification), (BartConfig, BartForSequenceClassification), (LongformerConfig, LongformerForSequenceClassification), (RobertaConfig, RobertaForSequenceClassification), (BertConfig, BertForSequenceClassification), (XLNetConfig, XLNetForSequenceClassification), (FlaubertConfig, FlaubertForSequenceClassification), (XLMConfig, XLMForSequenceClassification), (ElectraConfig, ElectraForSequenceClassification), ] ) MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), (LongformerConfig, LongformerForQuestionAnswering), (RobertaConfig, RobertaForQuestionAnswering), (BertConfig, BertForQuestionAnswering), (XLNetConfig, XLNetForQuestionAnsweringSimple), (FlaubertConfig, FlaubertForQuestionAnsweringSimple), (XLMConfig, XLMForQuestionAnsweringSimple), ] ) MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, DistilBertForTokenClassification), (CamembertConfig, CamembertForTokenClassification), (XLMConfig, XLMForTokenClassification), (XLMRobertaConfig, XLMRobertaForTokenClassification), (LongformerConfig, LongformerForTokenClassification), (RobertaConfig, RobertaForTokenClassification), (BertConfig, BertForTokenClassification), (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), (ElectraConfig, ElectraForTokenClassification), ] ) MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ (CamembertConfig, CamembertForMultipleChoice), (XLMRobertaConfig, XLMRobertaForMultipleChoice), (LongformerConfig, LongformerForMultipleChoice), (RobertaConfig, RobertaForMultipleChoice), (BertConfig, BertForMultipleChoice), (XLNetConfig, XLNetForMultipleChoice), ] ) class AutoModel: r""" :class:`~transformers1.AutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or the `AutoModel.from_config(config)` class methods. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModel is designed to be instantiated " "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModel.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertModel` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerModel` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaModel` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModel` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2Model` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraModel` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5Model` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertModel` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertModel` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertModel` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaModel` (XLM-RoBERTa model) - `longformer` :class:`~transformers1.LongformerModel` (Longformer model) - `roberta`: :class:`~transformers1.RobertaModel` (RoBERTa model) - `bert`: :class:`~transformers1.BertModel` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2Model` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraModel` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys()) ) ) class AutoModelForPreTraining: r""" :class:`~transformers1.AutoModelForPreTraining` is a generic model class that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForPreTraining is designed to be instantiated " "using the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForPreTraining.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForPreTraining` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForPreTraining` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForMaskedLM` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForMaskedLM` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForMaskedLM` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - `roberta`: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.BertForPreTraining` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraForPreTraining` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForPreTraining.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForPreTraining.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) class AutoModelWithLMHead: r""" :class:`~transformers1.AutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelWithLMHead is designed to be instantiated " "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelWithLMHead.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - isInstance of `longformer` configuration class: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForMaskedLM` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForMaskedLM` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelWithLMHead.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the language modeling model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.T5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.DistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForMaskedLM` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForMaskedLM` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForMaskedLM` (XLM-RoBERTa model) - `longformer`: :class:`~transformers1.LongformerForMaskedLM` (Longformer model) - `roberta`: :class:`~transformers1.RobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.BertForMaskedLM` (Bert model) - `openai-gpt`: :class:`~transformers1.OpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.GPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.XLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.XLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.CTRLLMHeadModel` (Salesforce CTRL model) - `flaubert`: :class:`~transformers1.FlaubertWithLMHeadModel` (Flaubert model) - `electra`: :class:`~transformers1.ElectraForMaskedLM` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) class AutoModelForSequenceClassification: r""" :class:`~transformers1.AutoModelForSequenceClassification` is a generic model class that will be instantiated as one of the sequence classification model classes of the library when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForSequenceClassification is designed to be instantiated " "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForSequenceClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForSequenceClassification` (DistilBERT model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForSequenceClassification` (ALBERT model) - isInstance of `camembert` configuration class: :class:`~transformers1.CamembertForSequenceClassification` (CamemBERT model) - isInstance of `xlm roberta` configuration class: :class:`~transformers1.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaForSequenceClassification` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.BertForSequenceClassification` (Bert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetForSequenceClassification` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForSequenceClassification` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertForSequenceClassification` (Flaubert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the sequence classification model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForSequenceClassification` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForSequenceClassification` (ALBERT model) - `camembert`: :class:`~transformers1.CamembertForSequenceClassification` (CamemBERT model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) - `roberta`: :class:`~transformers1.RobertaForSequenceClassification` (RoBERTa model) - `bert`: :class:`~transformers1.BertForSequenceClassification` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForSequenceClassification` (XLNet model) - `flaubert`: :class:`~transformers1.FlaubertForSequenceClassification` (Flaubert model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaining positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) class AutoModelForQuestionAnswering: r""" :class:`~transformers1.AutoModelForQuestionAnswering` is a generic model class that will be instantiated as one of the question answering model classes of the library when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForQuestionAnswering is designed to be instantiated " "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForQuestionAnswering.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertForQuestionAnswering` (DistilBERT model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForQuestionAnswering` (ALBERT model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModelForQuestionAnswering` (Bert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetForQuestionAnswering` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForQuestionAnswering` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers1.FlaubertForQuestionAnswering` (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForQuestionAnswering.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForQuestionAnswering` (DistilBERT model) - `albert`: :class:`~transformers1.AlbertForQuestionAnswering` (ALBERT model) - `bert`: :class:`~transformers1.BertForQuestionAnswering` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForQuestionAnswering` (XLNet model) - `xlm`: :class:`~transformers1.XLMForQuestionAnswering` (XLM model) - `flaubert`: :class:`~transformers1.FlaubertForQuestionAnswering` (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) class AutoModelForTokenClassification: r""" :class:`~transformers1.AutoModelForTokenClassification` is a generic model class that will be instantiated as one of the token classification model classes of the library when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForTokenClassification is designed to be instantiated " "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForTokenClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.DistilBertModelForTokenClassification` (DistilBERT model) - isInstance of `xlm` configuration class: :class:`~transformers1.XLMForTokenClassification` (XLM model) - isInstance of `xlm roberta` configuration class: :class:`~transformers1.XLMRobertaModelForTokenClassification` (XLMRoberta model) - isInstance of `bert` configuration class: :class:`~transformers1.BertModelForTokenClassification` (Bert model) - isInstance of `albert` configuration class: :class:`~transformers1.AlbertForTokenClassification` (AlBert model) - isInstance of `xlnet` configuration class: :class:`~transformers1.XLNetModelForTokenClassification` (XLNet model) - isInstance of `camembert` configuration class: :class:`~transformers1.CamembertModelForTokenClassification` (Camembert model) - isInstance of `roberta` configuration class: :class:`~transformers1.RobertaModelForTokenClassification` (Roberta model) - isInstance of `electra` configuration class: :class:`~transformers1.ElectraForTokenClassification` (Electra model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: :class:`~transformers1.DistilBertForTokenClassification` (DistilBERT model) - `xlm`: :class:`~transformers1.XLMForTokenClassification` (XLM model) - `xlm-roberta`: :class:`~transformers1.XLMRobertaForTokenClassification` (XLM-RoBERTa?Para model) - `camembert`: :class:`~transformers1.CamembertForTokenClassification` (Camembert model) - `bert`: :class:`~transformers1.BertForTokenClassification` (Bert model) - `xlnet`: :class:`~transformers1.XLNetForTokenClassification` (XLNet model) - `roberta`: :class:`~transformers1.RobertaForTokenClassification` (Roberta model) - `electra`: :class:`~transformers1.ElectraForTokenClassification` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: These arguments will be passed to the configuration and the model. Examples:: model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) class AutoModelForMultipleChoice: r""" :class:`~transformers1.AutoModelForMultipleChoice` is a generic model class that will be instantiated as one of the multiple choice model classes of the library when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "AutoModelForMultipleChoice is designed to be instantiated " "using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForMultipleChoice.from_config(config)` methods." ) @classmethod def from_config(cls, config): for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch BART model, ported from the fairseq repo.""" import logging import math import random from typing import Dict, List, Optional, Tuple import numpy as np import torch import torch.nn.functional as F from torch import Tensor, nn from .activations import ACT2FN from .configuration_bart import BartConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids logger = logging.getLogger(__name__) BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/bart-large", "facebook/bart-large-mnli", "facebook/bart-large-cnn", "facebook/bart-large-xsum", "facebook/mbart-large-en-ro", # See all BART models at https://huggingface.co/models?filter=bart ] BART_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior. Parameters: config (:class:`~transformers1.BartConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BART_GENERATION_EXAMPLE = r""" Examples:: from transformers1 import BartTokenizer, BartForConditionalGeneration, BartConfig # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example model = BartForConditionalGeneration.from_pretrained('bart-large-cnn') tokenizer = BartTokenizer.from_pretrained('bart-large-cnn') ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') # Generate Summary summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) """ BART_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them. Padding will be ignored by default should you provide it. Indices can be obtained using :class:`transformers1.BartTokenizer.encode(text)`. attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices in input_ids. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. If you want to change padding behavior, you should read :func:`~transformers1.modeling_bart._prepare_decoder_inputs` and modify. See diagram 1 in the paper for more info on the default strategy """ def invert_mask(attention_mask): assert attention_mask.dim() == 2 return attention_mask.eq(0) def _prepare_bart_decoder_inputs( config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32 ): """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided. This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during generation """ pad_token_id = config.pad_token_id if decoder_input_ids is None: decoder_input_ids = shift_tokens_right(input_ids, pad_token_id) bsz, tgt_len = decoder_input_ids.size() if decoder_padding_mask is None: decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id) else: decoder_padding_mask = invert_mask(decoder_padding_mask) causal_mask = torch.triu(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to( dtype=causal_mask_dtype, device=decoder_input_ids.device ) return decoder_input_ids, decoder_padding_mask, causal_mask class PretrainedBartModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" def _init_weights(self, module): std = self.config.init_std if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, SinusoidalPositionalEmbedding): pass elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @property def dummy_inputs(self): pad_token = self.config.pad_token_id input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, } return dummy_inputs def _make_linear_from_emb(emb): vocab_size, emb_size = emb.weight.shape lin_layer = nn.Linear(vocab_size, emb_size, bias=False) lin_layer.weight.data = emb.weight.data return lin_layer # Helper Functions, mostly for making masks def _check_shapes(shape_1, shape2): if shape_1 != shape2: raise AssertionError("shape mismatch: {} != {}".format(shape_1, shape2)) def shift_tokens_right(input_ids, pad_token_id): """Shift input ids one token to the right, and wrap the last non pad token (usually ).""" prev_output_tokens = input_ids.clone() index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze() prev_output_tokens[:, 1:] = input_ids[:, :-1] return prev_output_tokens def make_padding_mask(input_ids, padding_idx=1): """True for pad tokens""" padding_mask = input_ids.eq(padding_idx) if not padding_mask.any(): padding_mask = None return padding_mask # Helper Modules class EncoderLayer(nn.Module): def __init__(self, config: BartConfig): super().__init__() self.embed_dim = config.d_model self.output_attentions = config.output_attentions self.self_attn = SelfAttention( self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, ) self.normalize_before = config.normalize_before self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) def forward(self, x, encoder_padding_mask): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. for t_tgt, t_src is excluded (or masked out), =0 means it is included in attention Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) x, attn_weights = self.self_attn( query=x, key=x, key_padding_mask=encoder_padding_mask, need_weights=self.output_attentions ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.self_attn_layer_norm(x) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = F.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.final_layer_norm(x) return x, attn_weights class BartEncoder(nn.Module): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a :class:`EncoderLayer`. Args: config: BartConfig """ def __init__(self, config: BartConfig, embed_tokens): super().__init__() self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states embed_dim = embed_tokens.embedding_dim self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 self.padding_idx = embed_tokens.padding_idx self.max_source_positions = config.max_position_embeddings self.embed_tokens = embed_tokens if config.static_position_embeddings: self.embed_positions = SinusoidalPositionalEmbedding( config.max_position_embeddings, embed_dim, self.padding_idx ) else: self.embed_positions = LearnedPositionalEmbedding( config.max_position_embeddings, embed_dim, self.padding_idx, ) self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() # mbart has one extra layer_norm self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None def forward( self, input_ids, attention_mask=None, ): """ Args: input_ids (LongTensor): tokens in the source language of shape `(batch, src_len)` attention_mask (torch.LongTensor): indicating which indices are padding tokens. Returns: Tuple comprised of: - **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch, embed_dim)`. Only populated if *self.output_hidden_states:* is True. - **all_attentions** (List[Tensor]): Attention weights for each layer. During training might not be of length n_layers because of layer dropout. """ # check attention mask and invert if attention_mask is not None: attention_mask = invert_mask(attention_mask) inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale embed_pos = self.embed_positions(input_ids) x = inputs_embeds + embed_pos x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) encoder_states, all_attentions = [], [] for encoder_layer in self.layers: if self.output_hidden_states: encoder_states.append(x) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): # skip the layer attn = None else: x, attn = encoder_layer(x, attention_mask) if self.output_attentions: all_attentions.append(attn) if self.layer_norm: x = self.layer_norm(x) if self.output_hidden_states: encoder_states.append(x) # T x B x C -> B x T x C encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states] x = x.transpose(0, 1) return x, encoder_states, all_attentions class DecoderLayer(nn.Module): def __init__(self, config: BartConfig): super().__init__() self.embed_dim = config.d_model self.output_attentions = config.output_attentions self.self_attn = SelfAttention( embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.normalize_before = config.normalize_before self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.encoder_attn = SelfAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, encoder_decoder_attention=True, ) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) def forward( self, x, encoder_hidden_states, encoder_attn_mask=None, layer_state=None, causal_mask=None, decoder_padding_mask=None, ): residual = x if layer_state is None: layer_state = {} if self.normalize_before: x = self.self_attn_layer_norm(x) # Self Attention x, self_attn_weights = self.self_attn( query=x, key=x, layer_state=layer_state, # adds keys to layer state key_padding_mask=decoder_padding_mask, attn_mask=causal_mask, need_weights=self.output_attentions, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.self_attn_layer_norm(x) # Cross attention residual = x assert self.encoder_attn.cache_key != self.self_attn.cache_key if self.normalize_before: x = self.encoder_attn_layer_norm(x) x, _ = self.encoder_attn( query=x, key=encoder_hidden_states, key_padding_mask=encoder_attn_mask, layer_state=layer_state, # mutates layer state ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.encoder_attn_layer_norm(x) # Fully Connected residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = F.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x if not self.normalize_before: x = self.final_layer_norm(x) return ( x, self_attn_weights, layer_state, ) # just self_attn weights for now, following t5, layer_state = cache for decoding class BartDecoder(nn.Module): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`. Args: config: BartConfig embed_tokens (torch.nn.Embedding): output embedding """ def __init__(self, config: BartConfig, embed_tokens: nn.Embedding): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop self.padding_idx = embed_tokens.padding_idx self.max_target_positions = config.max_position_embeddings self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 self.embed_tokens = embed_tokens if config.static_position_embeddings: self.embed_positions = SinusoidalPositionalEmbedding( config.max_position_embeddings, config.d_model, config.pad_token_id ) else: self.embed_positions = LearnedPositionalEmbedding( config.max_position_embeddings, config.d_model, self.padding_idx, ) self.layers = nn.ModuleList( [DecoderLayer(config) for _ in range(config.decoder_layers)] ) # type: List[DecoderLayer] self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None def forward( self, input_ids, encoder_hidden_states, encoder_padding_mask, decoder_padding_mask, decoder_causal_mask, decoder_cached_states=None, use_cache=False, **unused ): """ Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019). Args: input_ids (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_hidden_states: output from the encoder, used for encoder-side attention encoder_padding_mask: for ignoring pad tokens decoder_cached_states (dict or None): dictionary used for storing state during generation Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - hidden states - attentions """ # check attention mask and invert if encoder_padding_mask is not None: encoder_padding_mask = invert_mask(encoder_padding_mask) # embed positions positions = self.embed_positions(input_ids, use_cache=use_cache) if use_cache: input_ids = input_ids[:, -1:] positions = positions[:, -1:] # happens after we embed them # assert input_ids.ne(self.padding_idx).any() x = self.embed_tokens(input_ids) * self.embed_scale x += positions x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) x = x.transpose(0, 1) encoder_hidden_states = encoder_hidden_states.transpose(0, 1) # decoder layers all_hidden_states = () all_self_attns = () next_decoder_cache = [] for idx, decoder_layer in enumerate(self.layers): # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) if self.output_hidden_states: all_hidden_states += (x,) dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): continue layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None x, layer_self_attn, layer_past = decoder_layer( x, encoder_hidden_states, encoder_attn_mask=encoder_padding_mask, decoder_padding_mask=decoder_padding_mask, layer_state=layer_state, causal_mask=decoder_causal_mask, ) if use_cache: next_decoder_cache.append(layer_past.copy()) if self.layer_norm and (idx == len(self.layers) - 1): # last layer of mbart x = self.layer_norm(x) if self.output_attentions: all_self_attns += (layer_self_attn,) # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states] x = x.transpose(0, 1) encoder_hidden_states = encoder_hidden_states.transpose(0, 1) if use_cache: next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache) else: next_cache = None return x, next_cache, all_hidden_states, list(all_self_attns) def _reorder_buffer(attn_cache, new_order): for k, input_buffer_k in attn_cache.items(): if input_buffer_k is not None: attn_cache[k] = input_buffer_k.index_select(0, new_order) return attn_cache class SelfAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( self, embed_dim, num_heads, dropout=0.0, bias=True, encoder_decoder_attention=False, # otherwise self_attention ): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim ** -0.5 self.encoder_decoder_attention = encoder_decoder_attention self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self" def _shape(self, tensor, dim_0, bsz): return tensor.contiguous().view(dim_0, bsz * self.num_heads, self.head_dim).transpose(0, 1) def forward( self, query, key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, layer_state: Optional[Dict[str, Optional[Tensor]]] = None, attn_mask: Optional[Tensor] = None, need_weights=False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time(SeqLen) x Batch x Channel""" static_kv: bool = self.encoder_decoder_attention tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] # get here for encoder decoder cause of static_kv if layer_state is not None: # reuse k,v and encoder_padding_mask saved_state = layer_state.get(self.cache_key, {}) if "prev_key" in saved_state: # previous time steps are cached - no need to recompute key and value if they are static if static_kv: key = None else: saved_state = None layer_state = {} q = self.q_proj(query) * self.scaling if static_kv: if key is None: k = v = None else: k = self.k_proj(key) v = self.v_proj(key) else: k = self.k_proj(query) v = self.v_proj(query) q = self._shape(q, tgt_len, bsz) if k is not None: k = self._shape(k, -1, bsz) if v is not None: v = self._shape(v, -1, bsz) if saved_state is not None: k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz) # Update cache layer_state[self.cache_key] = { "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim), "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim), "prev_key_padding_mask": key_padding_mask if not static_kv else None, } assert k is not None src_len = k.size(1) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len) if attn_mask is not None: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) # This is part of a workaround to get around fork/join parallelism not supporting Optional types. if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,) if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2) attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = F.softmax(attn_weights, dim=-1) attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,) assert v is not None attn_output = torch.bmm(attn_probs, v) assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn_output = self.out_proj(attn_output) if need_weights: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) else: attn_weights = None return attn_output, attn_weights def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz): # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: v = prev_value else: assert v is not None v = torch.cat([prev_value, v], dim=1) assert k is not None and v is not None prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None) key_padding_mask = self._cat_prev_key_padding_mask( key_padding_mask, prev_key_padding_mask, bsz, k.size(1), static_kv ) return k, v, key_padding_mask @staticmethod def _cat_prev_key_padding_mask( key_padding_mask: Optional[Tensor], prev_key_padding_mask: Optional[Tensor], batch_size: int, src_len: int, static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None: if static_kv: new_key_padding_mask = prev_key_padding_mask else: new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1) elif key_padding_mask is not None: filler = torch.zeros( batch_size, src_len - key_padding_mask.size(1), dtype=key_padding_mask.dtype, device=key_padding_mask.device, ) new_key_padding_mask = torch.cat([filler, key_padding_mask], dim=1) else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask class BartClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" # This can trivially be shared with RobertaClassificationHead def __init__( self, input_dim, inner_dim, num_classes, pooler_dropout, ): super().__init__() self.dense = nn.Linear(input_dim, inner_dim) self.dropout = nn.Dropout(p=pooler_dropout) self.out_proj = nn.Linear(inner_dim, num_classes) def forward(self, x): x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x class LearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to the forward function. """ def __init__( self, num_embeddings: int, embedding_dim: int, padding_idx: int, ): # if padding_idx is specified then offset the embedding ids by # this index and adjust num_embeddings appropriately assert padding_idx is not None num_embeddings += padding_idx + 1 # WHY? super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) def forward(self, input, use_cache=False): """Input is expected to be of size [bsz x seqlen].""" if use_cache: # the position is our current step in the decoded sequence pos = int(self.padding_idx + input.size(1)) positions = input.data.new(1, 1).fill_(pos) else: positions = create_position_ids_from_input_ids(input, self.padding_idx) return super().forward(positions) def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True): if torch.cuda.is_available(): try: from apex.normalization import FusedLayerNorm return FusedLayerNorm(normalized_shape, eps, elementwise_affine) except ImportError: pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) def fill_with_neg_inf(t): """FP16-compatible function that fills a input_ids with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _filter_out_falsey_values(tup) -> Tuple: """Remove entries that are None or [] from an iterable.""" return tuple(x for x in tup if isinstance(x, torch.Tensor) or x) # Public API def _get_shape(t): return getattr(t, "shape", None) @add_start_docstrings( "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING, ) class BartModel(PretrainedBartModel): def __init__(self, config: BartConfig): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states padding_idx, vocab_size = config.pad_token_id, config.vocab_size self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) self.encoder = BartEncoder(config, self.shared) self.decoder = BartDecoder(config, self.shared) self.init_weights() @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, encoder_outputs: Optional[Tuple] = None, decoder_attention_mask=None, decoder_cached_states=None, use_cache=False, ): # make masks if user doesn't supply if not use_cache: decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_bart_decoder_inputs( self.config, input_ids, decoder_input_ids=decoder_input_ids, decoder_padding_mask=decoder_attention_mask, causal_mask_dtype=self.shared.weight.dtype, ) else: decoder_padding_mask, causal_mask = None, None assert decoder_input_ids is not None if encoder_outputs is None: encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) assert isinstance(encoder_outputs, tuple) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) decoder_outputs = self.decoder( decoder_input_ids, encoder_outputs[0], attention_mask, decoder_padding_mask, decoder_causal_mask=causal_mask, decoder_cached_states=decoder_cached_states, use_cache=use_cache, ) # Attention and hidden_states will be [] or None if they aren't needed decoder_outputs: Tuple = _filter_out_falsey_values(decoder_outputs) assert isinstance(decoder_outputs[0], torch.Tensor) encoder_outputs: Tuple = _filter_out_falsey_values(encoder_outputs) return decoder_outputs + encoder_outputs def get_input_embeddings(self): return self.shared def set_input_embeddings(self, value): self.shared = value self.encoder.embed_tokens = self.shared self.decoder.embed_tokens = self.shared def get_output_embeddings(self): return _make_linear_from_emb(self.shared) # make it on the fly @add_start_docstrings( "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING + BART_GENERATION_EXAMPLE, ) class BartForConditionalGeneration(PretrainedBartModel): base_model_prefix = "model" def __init__(self, config: BartConfig): super().__init__(config) base_model = BartModel(config) self.model = base_model self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: old_num_tokens = self.model.shared.num_embeddings new_embeddings = super().resize_token_embeddings(new_num_tokens) self.model.shared = new_embeddings self._resize_final_logits_bias(new_num_tokens, old_num_tokens) return new_embeddings def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None: if new_num_tokens <= old_num_tokens: new_bias = self.final_logits_bias[:, :new_num_tokens] else: extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_cached_states=None, lm_labels=None, use_cache=False, **unused ): r""" lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # Mask filling only works for bart-large from transformers1 import BartTokenizer, BartForConditionalGeneration tokenizer = BartTokenizer.from_pretrained('bart-large') TXT = "My friends are but they eat too many carbs." model = BartForConditionalGeneration.from_pretrained('bart-large') input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids'] logits = model(input_ids)[0] masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() probs = logits[0, masked_index].softmax(dim=0) values, predictions = probs.topk(5) tokenizer.decode(predictions).split() # ['good', 'great', 'all', 'really', 'very'] """ outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, encoder_outputs=encoder_outputs, decoder_attention_mask=decoder_attention_mask, decoder_cached_states=decoder_cached_states, use_cache=use_cache, ) lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here if lm_labels is not None: loss_fct = nn.CrossEntropyLoss() # TODO(SS): do we need to ignore pad tokens in lm_labels? masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step, decoder_cached_states are empty if not past[1]: encoder_outputs, decoder_cached_states = past, None else: encoder_outputs, decoder_cached_states = past return { "input_ids": None, # encoder_outputs is defined. input_ids not needed "encoder_outputs": encoder_outputs, "decoder_cached_states": decoder_cached_states, "decoder_input_ids": decoder_input_ids, "attention_mask": attention_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } def prepare_logits_for_generation(self, logits, cur_len, max_length): if cur_len == 1: self._force_token_ids_generation(logits, self.config.bos_token_id) if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_ids_generation(logits, self.config.eos_token_id) return logits def _force_token_ids_generation(self, scores, token_ids) -> None: """force one of token_ids to be generated by setting prob of all other tokens to 0""" if isinstance(token_ids, int): token_ids = [token_ids] all_but_token_ids_mask = torch.tensor( [x for x in range(self.config.vocab_size) if x not in token_ids], dtype=torch.long, device=next(self.parameters()).device, ) assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" scores[:, all_but_token_ids_mask] = -float("inf") @staticmethod def _reorder_cache(past, beam_idx): ((enc_out, enc_mask), decoder_cached_states) = past reordered_past = [] for layer_past in decoder_cached_states: # get the correct batch idx from decoder layer's batch dim for cross and self-attn layer_past_new = { attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() } reordered_past.append(layer_past_new) new_enc_out = enc_out if enc_out is None else enc_out.index_select(0, beam_idx) new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx) past = ((new_enc_out, new_enc_mask), reordered_past) return past def get_encoder(self): return self.model.encoder def get_output_embeddings(self): return _make_linear_from_emb(self.model.shared) # make it on the fly @add_start_docstrings( """Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BART_START_DOCSTRING, ) class BartForSequenceClassification(PretrainedBartModel): def __init__(self, config: BartConfig, **kwargs): super().__init__(config, **kwargs) self.model = BartModel(config) self.classification_head = BartClassificationHead( config.d_model, config.d_model, config.num_labels, config.classif_dropout, ) self.model._init_weights(self.classification_head.dense) self.model._init_weights(self.classification_head.out_proj) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) def forward( self, input_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BartConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification loss (cross entropy) logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BartTokenizer, BartForSequenceClassification import torch tokenizer = BartTokenizer.from_pretrained('bart-large') model = BartForSequenceClassification.from_pretrained('bart-large') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, encoder_outputs=encoder_outputs, ) x = outputs[0] # last hidden state eos_mask = input_ids.eq(self.config.eos_token_id) if len(torch.unique(eos_mask.sum(1))) > 1: raise ValueError("All examples must have the same number of tokens.") sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] logits = self.classification_head(sentence_representation) # Prepend logits outputs = (logits,) + outputs[1:] # Add hidden states and attention if they are here if labels is not None: # prepend loss to output, loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs class SinusoidalPositionalEmbedding(nn.Embedding): """This module produces sinusoidal positional embeddings of any length.""" def __init__(self, num_positions, embedding_dim, padding_idx=None): super().__init__(num_positions, embedding_dim) if embedding_dim % 2 != 0: raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") self.weight = self._init_weight(self.weight) @staticmethod def _init_weight(out: nn.Parameter): """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in the 2nd half of the vector. [dim // 2:] """ n_pos, dim = out.shape position_enc = np.array( [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] ) out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) # This line breaks for odd n_pos out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False return out @torch.no_grad() def forward(self, input_ids, use_cache=False): """Input is expected to be of size [bsz x seqlen].""" bsz, seq_len = input_ids.shape[:2] if use_cache: positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing else: # starts at 0, ends at 1-seq_len positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device) return super().forward(positions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_beam_search.py ================================================ # coding=utf-8 # Copyright (c) 2019 Yang Liu # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """ A general wrapper around models with LM heads to generate sequences using beam search. """ import torch from torch import nn class TransformerBeamSearch(nn.Module): def __init__( self, model, tokenizer, batch_size, beam_size, min_length, max_length, alpha=0, block_repeating_trigram=True, ): """ Attributes: mask_word_id: token id that corresponds to the mask """ super(TransformerBeamSearch, self).__init__() self.model = model self.tokenizer = tokenizer self.start_token_id = tokenizer.start_token_id self.end_token_id = tokenizer.end_token_id self.pad_token_id = tokenizer.pad_token_id self.beam_size = beam_size self.min_length = min_length self.max_length = max_length self.block_repeating_trigram = block_repeating_trigram self.apply_length_penalty = False if alpha == 0 else True self.alpha = alpha # State of the beam self.hypotheses = [[] for _ in range(batch_size)] self.batch_offset = torch.arange(batch_size, dtype=torch.long) self.beam_offset = torch.arange( 0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long ) self.growing_beam = torch.full( (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long ) self.topk_log_probabilities = torch.tensor( [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float ).repeat(batch_size) self.results = { "prediction": [[] for _ in batch_size], "scores": [[] for _ in batch_size], } self._step = 0 self.is_done = False def step(self, log_probabilities): """ Grows the beam by one step. """ self._step += 1 # The batch size changes as some beams finish so we define _B vocab_size = log_probabilities.size(-1) _B = log_probabilities.size(0) // self.beam_size # Multiply each beam probability with the probability of the # next token (conditioned on the words in the beam). log_probabilities += self.topk_log_probabilities.view(-1, 1) self.enforce_min_length(log_probabilities) if self.block_repeating_trigram: self.remove_repeating_trigrams(log_probabilities, _B) # Find the `beam_size` (previous_beam + token) combinations with # the highest score topk_log_probabilities, topk_ids = log_probabilities.topk( log_probabilities.view(_B, self.beam_size * vocab_size), self.beam_size, dim=1, ) # Apply the length penalty. The +1 accounts for the [EOS] token # that will be added if the beam ends. topk_scores = topk_log_probabilities / self.length_penalty() # Retrieve the corresponding respective beam and token id # topk_token_ids[i] will be added to topk_beam_ids[i] topk_beam_ids = topk_ids.div(vocab_size) topk_token_ids = topk_ids.fmod(vocab_size) # Retrieve the row index of the surviving beams in the original # view of the log_probabilities tensor surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view( -1 ) # Append the last predictions self.growing_beam = torch.cat( [ self.growing_beam.index_select(0, surviving_beams_rows), topk_token_ids.view(-1, 1), ], 1, ) # Check if any of the beam searches has ended during this # growth step. Also if top beam (most probable) has ended # for one element of the batch. is_finished = topk_token_ids.eq(self.end_token_id) self.enforce_max_length() is_top_beam_finished = is_finished[:, 0].eq(1) # Save the finished searches if is_finished.any(): predictions = self.growing_beam.view( -1, self.beam_size, self.growing_beam.size(1) ) for i in range(is_finished.size(0)): if is_top_beam_finished[i]: is_finished[i].fill_(1) finished_hyp = is_finished[i].nonzero().view(-1) # Store finished hypotheses for this batch. b = self.batch_offset[i] for j in finished_hyp: self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :])) # If the batch reached the end, save the best hypotheses # in terms of length-penalized score. if is_top_beam_finished[i]: best_hyp = sorted( self.hypotheses[b], key=lambda x: x[0], reverse=True ) best_score, best_prediction = best_hyp[0] self.results["scores"][b].append(best_score) self.results["predictions"][b].append(best_prediction) non_finished = is_top_beam_finished.eq(0).nonzero().view(-1) if len(non_finished) == 0: self.is_done = True # Remove finished batches for the next step. topk_log_probabilities = topk_log_probabilities.index_select( 0, non_finished ) self.batch_offset = self.batch_offset.index_select(0, non_finished) self.growing_beam = predictions.index_select(0, non_finished).view( -1, self.growing_beam.size(-1) ) surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished) return surviving_beams_rows def forward(self, encoder_input_ids, **kwargs): # keyword arguments come in 3 flavors: encoder-specific (prefixed by # `encoder_`), decoder-specific (prefixed by `decoder_`) and those # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. kwargs_encoder = { argument[len("encoder_"):]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_"):]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } kwargs_common = { argument: value for argument, value in kwargs.items() if not (argument.startswith("encoder_") or argument.startswith("decoder_")) } kwargs_decoder = dict(kwargs_common, **kwargs_decoder) kwargs_encoder = dict(kwargs_common, **kwargs_encoder) # forward pass on the encoder encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder) kwargs_decoder["encoder_hidden_states"] = tile( encoder_outputs, self.beam_size, dim=0 ) # grow the beam by generating sequences in an autoregressive way self.growing_beam = torch.full( (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long ) for step in range(self.max_length): decoder_input = self.growing_beam[:, -1] outputs = self.model.decoder(decoder_input, kwargs_decoder) log_probabilities = torch.nn.functional.log_softmax(outputs[1]) surviving_beams_rows = self.step(log_probabilities) if self.is_done: break kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[ "encoder_hidden_states" ].index_select(0, surviving_beams_rows) return self.results def remove_repeating_trigrams(self, log_probabilities, _B): if(self._step + 1 > 3): for i in range(_B * self.beam_size): tokens = [t for t in self.growing_beam[i]] trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)] last_trigram = tuple(trigrams[-1]) if last_trigram in trigrams[:-1]: log_probabilities[i] = -1e20 def enforce_min_length(self): if self._step < self.min_length: self.log_probabilities[self.end_token_id] = -1e20 def enforce_max_length(self): if self._step + 1 == self.max_length: self.is_finished.fill_(1) def length_penalty(self): return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha def tile(x, count, dim=0): """ Tiles `x` along dimension `dim` `count` times. Example: >> ex = torch.tensor([1,2],[3,4]) >> tile(ex, 2, 0) torch.Tensor([[1,2],[1,2],[3,4],[3,4]]) """ perm = list(range(len(x.size()))) if dim != 0: perm[0], perm[dim] = perm[dim], perm[0] x = x.permute(perm).contiguous() out_size = list(x.size()) out_size[0] *= count batch = x.size(0) x = ( x.view(batch, -1) .transpose(0, 1) .repeat(count, 1) .transpose(0, 1) .contiguous() .view(*out_size) ) if dim != 0: x = x.permute(perm).contiguous() return x ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch BERT model. """ import logging import math import os import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .activations import gelu, gelu_new, swish from .configuration_bert import BertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased", "bert-base-chinese", "bert-base-german-cased", "bert-large-uncased-whole-word-masking", "bert-large-cased-whole-word-masking", "bert-large-uncased-whole-word-masking-finetuned-squad", "bert-large-cased-whole-word-masking-finetuned-squad", "bert-base-cased-finetuned-mrpc", "bert-base-german-dbmdz-cased", "bert-base-german-dbmdz-uncased", "cl-tohoku/bert-base-japanese", "cl-tohoku/bert-base-japanese-whole-word-masking", "cl-tohoku/bert-base-japanese-char", "cl-tohoku/bert-base-japanese-char-whole-word-masking", "TurkuNLP/bert-base-finnish-cased-v1", "TurkuNLP/bert-base-finnish-uncased-v1", "wietsedv/bert-base-dutch-cased", # See all BERT models at https://huggingface.co/models?filter=bert ] def load_tf_weights_in_bert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model def mish(x): return x * torch.tanh(nn.functional.softplus(x)) ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} BertLayerNorm = torch.nn.LayerNorm class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] seq_length = input_shape[1] device = input_ids.device if input_ids is not None else inputs_embeds.device if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class BertSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class BertSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertAttention(nn.Module): def __init__(self, config): super().__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class BertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states class BertOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class BertLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = BertAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class BertEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class BertPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class BertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() self.transform = BertPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) return hidden_states class BertOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) def forward(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores class BertOnlyNSPHead(nn.Module): def __init__(self, config): super().__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score class BertPreTrainingHeads(nn.Module): def __init__(self, config): super().__init__() self.predictions = BertLMPredictionHead(config) self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, sequence_output, pooled_output): prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class BertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = BertConfig load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, BertLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() BERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class BertModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) outputs = (prediction_scores, seq_relationship_score,) + outputs[ 2: ] # add hidden states and attention if they are here if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs if lm_labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one prediction_scores = prediction_scores[:, :-1, :].contiguous() lm_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss() ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (ltr_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class BertForNextSentencePrediction(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt') loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) assert logits[0, 0] < logits[0, 1] # next sentence was random """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class BertForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class BertForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." choice0 = "It is eaten with a fork and a knife." choice1 = "It is eaten while held in the hand." labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;)) encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 # the linear classifier still needs to be trained loss, logits = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_camembert.py ================================================ # coding=utf-8 # Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch CamemBERT model. """ import logging from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "camembert-base", "Musixmatch/umberto-commoncrawl-cased-v1", "Musixmatch/umberto-wikipedia-uncased-v1", # See all CamemBERT models at https://huggingface.co/models?filter=camembert ] CAMEMBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", CAMEMBERT_START_DOCSTRING, ) class CamembertModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers1.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForSequenceClassification(RobertaForSequenceClassification): """ This class overrides :class:`~transformers1.RobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForMultipleChoice(RobertaForMultipleChoice): """ This class overrides :class:`~transformers1.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForTokenClassification(RobertaForTokenClassification): """ This class overrides :class:`~transformers1.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits` """, CAMEMBERT_START_DOCSTRING, ) class CamembertForQuestionAnswering(RobertaForQuestionAnswering): """ This class overrides :class:`~transformers1.RobertaForQuestionAnswering`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch CTRL model.""" import logging import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel logger = logging.getLogger(__name__) CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ctrl" # See all CTRL models at https://huggingface.co/models?filter=ctrl ] def angle_defn(pos, i, d_model_size): angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) return pos * angle_rates def positional_encoding(position, d_model_size, dtype): # create the sinusoidal pattern for the positional encoding angle_rads = angle_defn( torch.arange(position, dtype=dtype).unsqueeze(1), torch.arange(d_model_size, dtype=dtype).unsqueeze(0), d_model_size, ) sines = torch.sin(angle_rads[:, 0::2]) cosines = torch.cos(angle_rads[:, 1::2]) pos_encoding = torch.cat([sines, cosines], dim=-1) return pos_encoding def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) dk = k.shape[-1] scaled_attention_logits = matmul_qk / np.sqrt(dk) if mask is not None: nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask attention_weights = torch.softmax(scaled_attention_logits, dim=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask output = torch.matmul(attention_weights, v) return output, attention_weights class MultiHeadAttention(torch.nn.Module): def __init__(self, d_model_size, num_heads, output_attentions=False): super().__init__() self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size self.depth = int(d_model_size / self.num_heads) self.Wq = torch.nn.Linear(d_model_size, d_model_size) self.Wk = torch.nn.Linear(d_model_size, d_model_size) self.Wv = torch.nn.Linear(d_model_size, d_model_size) self.dense = torch.nn.Linear(d_model_size, d_model_size) def split_into_heads(self, x, batch_size): x = x.reshape(batch_size, -1, self.num_heads, self.depth) return x.permute([0, 2, 1, 3]) def forward(self, v, k, q, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): batch_size = q.shape[0] q = self.Wq(q) k = self.Wk(k) v = self.Wv(v) q = self.split_into_heads(q, batch_size) k = self.split_into_heads(k, batch_size) v = self.split_into_heads(v, batch_size) if layer_past is not None: past_key, past_value = layer_past[0], layer_past[1] k = torch.cat((past_key, k), dim=-2) v = torch.cat((past_value, v), dim=-2) if use_cache is True: present = torch.stack((k, v)) else: present = (None,) output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = output[0].permute([0, 2, 1, 3]) attn = output[1] original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size) output = self.dense(original_size_attention) outputs = (output, present) if self.output_attentions: outputs = outputs + (attn,) return outputs def point_wise_feed_forward_network(d_model_size, dff): return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size)) class EncoderLayer(torch.nn.Module): def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False): super().__init__() self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions) self.ffn = point_wise_feed_forward_network(d_model_size, dff) self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=1e-6) self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=1e-6) self.dropout1 = torch.nn.Dropout(rate) self.dropout2 = torch.nn.Dropout(rate) def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): normed = self.layernorm1(x) attn_outputs = self.multi_head_attention( normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output) out1 = x + attn_output out2 = self.layernorm2(out1) ffn_output = self.ffn(out2) ffn_output = self.dropout2(ffn_output) out2 = out1 + ffn_output outputs = (out2,) + attn_outputs[1:] return outputs class CTRLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = CTRLConfig base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) CTRL_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.CTRLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ CTRL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only input_ids that do not have their past calculated should be passed as input_ids. Indices can be obtained using :class:`transformers1.CTRLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The input_ids which have their past given to this model should not be passed as input ids as they have already been computed. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If `past` is used, optionally only the last `inputs_embeds` have to be input (see `past`). use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. """ @add_start_docstrings( "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", CTRL_START_DOCSTRING, ) class CTRLModel(CTRLPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.d_model_size = config.n_embd self.num_layers = config.n_layer self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float) self.w = nn.Embedding(config.vocab_size, config.n_embd) self.dropout = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList( [ EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions) for _ in range(config.n_layer) ] ) self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() def get_input_embeddings(self): return self.w def set_input_embeddings(self, new_embeddings): self.w = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import CTRLTokenizer, CTRLModel import torch tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLModel.from_pretrained('ctrl') input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size = inputs_embeds.shape[0] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = past[0][0].size(-2) if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: assert batch_size > 0, "batch_size has to be defined and > 0" attention_mask = attention_mask.view(batch_size, -1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layer) if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, input_shape[-1]) token_type_embeds = self.w(token_type_ids) token_type_embeds *= np.sqrt(self.d_model_size) else: token_type_embeds = 0 position_ids = position_ids.view(-1, input_shape[-1]) if inputs_embeds is None: inputs_embeds = self.w(input_ids) # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded seq_len = input_shape[-1] mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device) inputs_embeds *= np.sqrt(self.d_model_size) pos_embeds = self.pos_encoding[position_ids, :].to(inputs_embeds.device) hidden_states = inputs_embeds + pos_embeds + token_type_embeds hidden_states = self.dropout(hidden_states) output_shape = input_shape + (inputs_embeds.size(-1),) presents = () all_hidden_states = () all_attentions = [] for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = h( hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i], use_cache=use_cache, ) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.layernorm(hidden_states) hidden_states = hidden_states.view(*output_shape) if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs @add_start_docstrings( """The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) class CTRLLMHeadModel(CTRLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = CTRLModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) self.init_weights() def get_output_embeddings(self): return self.lm_head def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: input_ids = input_ids[:, -1].unsqueeze(-1) return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=True, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import CTRLTokenizer, CTRLLMHeadModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLLMHeadModel.from_pretrained('ctrl') input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import logging import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import gelu from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters ---------- input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. Outputs ------- embeddings: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None): """ Parameters ---------- query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Outputs ------- weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( config.activation ) self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None): """ Parameters ---------- x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Outputs ------- sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. """ # Self-Attention sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) if self.output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if self.output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward(self, x, attn_mask=None, head_mask=None): """ Parameters ---------- x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Outputs ------- hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () all_attentions = () hidden_state = x for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i]) hidden_state = layer_outputs[-1] if self.output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) outputs = (hidden_state,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, nn.Embedding): if module.weight.requires_grad: module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() DISTILBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.DistilBertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertModel import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertModel.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask) hidden_state = tfmr_output[0] output = (hidden_state,) + tfmr_output[1:] return output # last-layer hidden-state, (all hidden_states), (all attentions) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForMaskedLM import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) outputs = (prediction_logits,) + dlbrt_output[1:] if masked_lm_labels is not None: mlm_loss = self.mlm_loss_fct( prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1) ) outputs = (mlm_loss,) + outputs return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) @add_start_docstrings( """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForSequenceClassification import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForQuestionAnswering import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss, start_scores, end_scores = outputs[:3] """ distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) outputs = (start_logits, end_logits,) + distilbert_output[1:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.DistilBertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import DistilBertTokenizer, DistilBertForTokenClassification import torch tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_electra.py ================================================ import logging import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .activations import get_activation from .configuration_electra import ElectraConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel logger = logging.getLogger(__name__) ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/electra-small-generator", "google/electra-base-generator", "google/electra-large-generator", "google/electra-small-discriminator", "google/electra-base-discriminator", "google/electra-large-discriminator", # See all ELECTRA models at https://huggingface.co/models?filter=electra ] def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): original_name: str = name try: if isinstance(model, ElectraForMaskedLM): name = name.replace("electra/embeddings/", "generator/embeddings/") if discriminator_or_generator == "generator": name = name.replace("electra/", "discriminator/") name = name.replace("generator/", "electra/") name = name.replace("dense_1", "dense_prediction") name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias") name = name.split("/") # print(original_name, name) # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["global_step", "temperature"] for n in name): logger.info("Skipping {}".format(original_name)) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name.endswith("_embeddings"): pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape, original_name except AssertionError as e: e.args += (pointer.shape, array.shape) raise print("Initialize PyTorch weight {}".format(name), original_name) pointer.data = torch.from_numpy(array) except AttributeError as e: print("Skipping {}".format(original_name), name, e) continue return model class ElectraEmbeddings(BertEmbeddings): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): super().__init__(config) self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = BertLayerNorm(config.embedding_size, eps=config.layer_norm_eps) class ElectraDiscriminatorPredictions(nn.Module): """Prediction module for the discriminator, made up of two dense layers.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dense_prediction = nn.Linear(config.hidden_size, 1) self.config = config def forward(self, discriminator_hidden_states, attention_mask): hidden_states = self.dense(discriminator_hidden_states) hidden_states = get_activation(self.config.hidden_act)(hidden_states) logits = self.dense_prediction(hidden_states).squeeze() return logits class ElectraGeneratorPredictions(nn.Module): """Prediction module for the generator, made up of two dense layers.""" def __init__(self, config): super().__init__() self.LayerNorm = BertLayerNorm(config.embedding_size) self.dense = nn.Linear(config.hidden_size, config.embedding_size) def forward(self, generator_hidden_states): hidden_states = self.dense(generator_hidden_states) hidden_states = get_activation("gelu")(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class ElectraPreTrainedModel(BertPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = ElectraConfig load_tf_weights = load_tf_weights_in_electra base_model_prefix = "electra" ELECTRA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.ElectraConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ELECTRA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.ElectraTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " "hidden size and embedding size are different." "" "Both the generator and discriminator checkpoints may be loaded into this model.", ELECTRA_START_DOCSTRING, ) class ElectraModel(ElectraPreTrainedModel): config_class = ElectraConfig def __init__(self, config): super().__init__(config) self.embeddings = ElectraEmbeddings(config) if config.embedding_size != config.hidden_size: self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) self.encoder = BertEncoder(config) self.config = config self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraModel, ElectraTokenizer import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraModel.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) hidden_states = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states) hidden_states = self.encoder(hidden_states, attention_mask=extended_attention_mask, head_mask=head_mask) return hidden_states class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x @add_start_docstrings( """ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ELECTRA_START_DOCSTRING, ) class ElectraForSequenceClassification(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.electra = ElectraModel(config) self.classifier = ElectraClassificationHead(config) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) sequence_output = discriminator_hidden_states[0] logits = self.classifier(sequence_output) outputs = (logits,) + discriminator_hidden_states[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a binary classification head on top as used during pre-training for identifying generated tokens. It is recommended to load the discriminator checkpoint into that model.""", ELECTRA_START_DOCSTRING, ) class ElectraForPreTraining(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.discriminator_predictions = ElectraDiscriminatorPredictions(config) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates the token is an original token, ``1`` indicates the token was replaced. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss of the ELECTRA objective. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) Prediction scores of the head (scores for each token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForPreTraining import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) discriminator_sequence_output = discriminator_hidden_states[0] logits = self.discriminator_predictions(discriminator_sequence_output, attention_mask) output = (logits,) if labels is not None: loss_fct = nn.BCEWithLogitsLoss() if attention_mask is not None: active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] active_labels = labels[active_loss] loss = loss_fct(active_logits, active_labels.float()) else: loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) output = (loss,) + output output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a language modeling head on top. Even though both the discriminator and generator may be loaded into this model, the generator is the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) class ElectraForMaskedLM(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.generator_predictions = ElectraGeneratorPredictions(config) self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) self.init_weights() def get_output_embeddings(self): return self.generator_lm_head @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForMaskedLM import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ generator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) generator_sequence_output = generator_hidden_states[0] prediction_scores = self.generator_predictions(generator_sequence_output) prediction_scores = self.generator_lm_head(prediction_scores) output = (prediction_scores,) # Masked language modeling softmax layer if masked_lm_labels is not None: loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) output = (loss,) + output output += generator_hidden_states[1:] return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a token classification head on top. Both the discriminator and generator may be loaded into this model.""", ELECTRA_START_DOCSTRING, ) class ElectraForTokenClassification(ElectraPreTrainedModel): def __init__(self, config): super().__init__(config) self.electra = ElectraModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ElectraTokenizer, ElectraForTokenClassification import torch tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds ) discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) output = (logits,) if labels is not None: loss_fct = nn.CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.config.num_labels)[active_loss] active_labels = labels.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) output = (loss,) + output output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classes to support Encoder-Decoder architectures """ import logging from typing import Optional from .configuration_encoder_decoder import EncoderDecoderConfig from .configuration_utils import PretrainedConfig from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) class EncoderDecoderModel(PreTrainedModel): r""" :class:`~transformers1.EncoderDecoder` is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method for the encoder and `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method for the decoder. """ config_class = EncoderDecoderConfig base_model_prefix = "encoder_decoder" def __init__( self, config: Optional[PretrainedConfig] = None, encoder: Optional[PreTrainedModel] = None, decoder: Optional[PreTrainedModel] = None, ): assert config is not None or ( encoder is not None and decoder is not None ), "Either a configuration or an Encoder and a decoder has to be provided" if config is None: config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config) else: assert isinstance(config, self.config_class), "config: {} has to be of type {}".format( config, self.config_class ) # initialize with config super().__init__(config) if encoder is None: from transformers import AutoModel encoder = AutoModel.from_config(config.encoder) if decoder is None: from transformers import AutoModelWithLMHead decoder = AutoModelWithLMHead.from_config(config.decoder) self.encoder = encoder self.decoder = decoder assert ( self.encoder.get_output_embeddings() is None ), "The encoder {} should not have a LM Head. Please use a model without LM Head" def tie_weights(self): # for now no weights tying in encoder-decoder pass def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder def get_input_embeddings(self): return self.encoder.get_input_embeddings() def get_output_embeddings(self): return self.decoder.get_output_embeddings() @classmethod def from_encoder_decoder_pretrained( cls, encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, **kwargs ) -> PreTrainedModel: r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints. The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train the model, you need to first set it back in training mode with `model.train()`. Params: encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the encoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the decoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method kwargs: (`optional`) Remaining dictionary of keyword arguments. Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: Examples:: from transformers1 import EncoderDecoder model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert """ kwargs_encoder = { argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } # Load and initialize the encoder and decoder # The distinction between encoder and decoder at the model level is made # by the value of the flag `is_decoder` that we need to set correctly. encoder = kwargs_encoder.pop("model", None) if encoder is None: assert ( encoder_pretrained_model_name_or_path is not None ), "If `model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModel encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) encoder.config.is_decoder = False decoder = kwargs_decoder.pop("model", None) if decoder is None: assert ( decoder_pretrained_model_name_or_path is not None ), "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModelWithLMHead if "config" not in kwargs_decoder: from transformers import AutoConfig decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) if decoder_config.is_decoder is False: logger.info( f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." ) decoder_config.is_decoder = True kwargs_decoder["config"] = decoder_config if kwargs_decoder["config"].is_decoder is False: logger.warning( f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, make sure that the attribute `is_decoder` of `decoder_config` passed to `.from_encoder_decoder_pretrained(...)` is set to `True` or do not pass a `decoder_config` to `.from_encoder_decoder_pretrained(...)`" ) decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) return cls(encoder=encoder, decoder=decoder) def forward( self, input_ids=None, inputs_embeds=None, attention_mask=None, head_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_head_mask=None, decoder_inputs_embeds=None, masked_lm_labels=None, lm_labels=None, **kwargs, ): """ Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary for the encoder. Indices can be obtained using :class:`transformers1.PretrainedTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices for the encoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules for the encoder. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training to the decoder. Indices can be obtained using :class:`transformers1.PretrainedTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules for the decoder. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction) for the decoder. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` kwargs: (`optional`) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors: - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function. - With a `decoder_` prefix which will be input as `**decoder_kwargs` for the decoder forward function. Examples:: from transformers1 import EncoderDecoderModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert # forward input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) # training loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2] # generation generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id) """ kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, **kwargs_encoder, ) hidden_states = encoder_outputs[0] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, inputs_embeds=decoder_inputs_embeds, attention_mask=decoder_attention_mask, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, lm_labels=lm_labels, masked_lm_labels=masked_lm_labels, **kwargs_decoder, ) return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, input_ids, past, attention_mask, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if type(past) is tuple: encoder_outputs = past else: encoder_outputs = (past,) decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids) return { "attention_mask": attention_mask, "decoder_attention_mask": decoder_inputs["attention_mask"], "decoder_input_ids": decoder_inputs["input_ids"], "encoder_outputs": encoder_outputs, } def _reorder_cache(self, past, beam_idx): # as a default encoder-decoder models do not re-order the past. # TODO(PVP): might have to be updated, e.g. if GPT2 is to be used as a decoder return past ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Flaubert model, based on XLM. """ import logging import random import torch from torch.nn import functional as F from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_xlm import ( XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, XLMForSequenceClassification, XLMModel, XLMWithLMHeadModel, get_masks, ) logger = logging.getLogger(__name__) FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "flaubert/flaubert_small_cased", "flaubert/flaubert_base_uncased", "flaubert/flaubert_base_cased", "flaubert/flaubert_large_cased", # See all Flaubert models at https://huggingface.co/models?filter=flaubert ] FLAUBERT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.FlaubertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ FLAUBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`): dictionary with ``torch.FloatTensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", FLAUBERT_START_DOCSTRING, ) class FlaubertModel(XLMModel): config_class = FlaubertConfig def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import FlaubertTokenizer, FlaubertModel import torch tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') model = FlaubertModel.from_pretrained('flaubert-base-cased') input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # removed: src_enc=None, src_len=None if input_ids is not None: bs, slen = input_ids.size() else: bs, slen = inputs_embeds.size()[:-1] if lengths is None: if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs assert lengths.size(0) == bs assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] device = input_ids.device if input_ids is not None else inputs_embeds.device # position_ids if position_ids is None: position_ids = torch.arange(slen, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand((bs, slen)) else: assert position_ids.size() == (bs, slen) # (slen, bs) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: assert langs.size() == (bs, slen) # (slen, bs) # langs = langs.transpose(0, 1) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layers) # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) if langs is not None and self.use_lang_emb and self.config.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = F.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): # LayerDrop dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): continue if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention if not self.pre_norm: attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) else: tensor_normalized = self.layer_norm1[i](tensor) attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN if not self.pre_norm: tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) else: tensor_normalized = self.layer_norm2[i](tensor) tensor = tensor + self.ffns[i](tensor_normalized) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) @add_start_docstrings( """The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, FLAUBERT_START_DOCSTRING, ) class FlaubertWithLMHeadModel(XLMWithLMHeadModel): """ This class overrides :class:`~transformers1.XLMWithLMHeadModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, FLAUBERT_START_DOCSTRING, ) class FlaubertForSequenceClassification(XLMForSequenceClassification): """ This class overrides :class:`~transformers1.XLMForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, ) class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): """ This class overrides :class:`~transformers1.XLMForQuestionAnsweringSimple`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() @add_start_docstrings( """Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, ) class FlaubertForQuestionAnswering(XLMForQuestionAnswering): """ This class overrides :class:`~transformers1.XLMForQuestionAnswering`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch OpenAI GPT-2 model.""" import logging import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import ACT2FN from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer logger = logging.getLogger(__name__) GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2", # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 ] def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model """ try: import re import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(gpt2_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array.squeeze()) for name, array in zip(names, arrays): name = name[6:] # skip "model/" name = name.split("/") pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): scope_names = re.split(r"(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "w" or scope_names[0] == "g": pointer = getattr(pointer, "weight") elif scope_names[0] == "b": pointer = getattr(pointer, "bias") elif scope_names[0] == "wpe" or scope_names[0] == "wte": pointer = getattr(pointer, scope_names[0]) pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super().__init__() self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.register_buffer( "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx) ) self.register_buffer("masked_bias", torch.tensor(-1e4)) self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / (float(v.size(-1)) ** 0.5) nd, ns = w.size(-2), w.size(-1) mask = self.bias[:, :, ns - nd : ns, :ns] w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype)) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [torch.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = x.permute(0, 2, 1, 3).contiguous() new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states def split_heads(self, x, k=False): new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states if k: return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) else: return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) if layer_past is not None: past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below key = torch.cat((past_key, key), dim=-1) value = torch.cat((past_value, value), dim=-2) if use_cache is True: present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking else: present = (None,) attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) outputs = [a, present] + attn_outputs[1:] return outputs # a, present, (attentions) class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = ACT2FN[config.activation_function] self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) return self.dropout(h2) class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super().__init__() nx = config.n_embd self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False): output_attn = self.attn( self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a m = self.mlp(self.ln_2(x)) x = x + m outputs = [x] + output_attn[1:] return outputs # x, present, (attentions) class GPT2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = GPT2Config load_tf_weights = load_tf_weights_in_gpt2 base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) GPT2_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only `input_ids` that do not have their past calculated should be passed as `input_ids`. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The `input_ids` which have their past given to this model should not be passed as `input_ids` as they have already been computed. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`, defaults to :obj:`None`): `input_ids_length` = `sequence_length if `past` is None else 1 Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If `past` is used, optionally only the last `inputs_embeds` have to be input (see `past`). use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. """ @add_start_docstrings( "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.wte = nn.Embedding(config.vocab_size, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() def get_input_embeddings(self): return self.wte def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. If `past` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import GPT2Tokenizer, GPT2Model import torch tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) batch_size = input_ids.shape[0] elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size = inputs_embeds.shape[0] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, input_shape[-1]) if position_ids is not None: position_ids = position_ids.view(-1, input_shape[-1]) if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = past[0][0].size(-2) if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: assert batch_size > 0, "batch_size has to be defined and > 0" attention_mask = attention_mask.view(batch_size, -1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) presents = () all_attentions = [] all_hidden_states = () for i, (block, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = block( hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i], use_cache=use_cache, ) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.ln_f(hidden_states) hidden_states = hidden_states.view(*output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, (presents), (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, ) class GPT2LMHeadModel(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.init_weights() def get_output_embeddings(self): return self.lm_head def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: input_ids = input_ids[:, -1].unsqueeze(-1) return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=True, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import GPT2Tokenizer, GPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, GPT2_START_DOCSTRING, ) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def __init__(self, config): super().__init__(config) config.num_labels = 1 self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, lm_labels=None, mc_labels=None, use_cache=True, ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): Language modeling loss. mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import GPT2Tokenizer, GPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') # Add a [CLS] to the vocabulary (we should train it also!) tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch Longformer model. """ import logging import math import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .configuration_longformer import LongformerConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertPreTrainedModel from .modeling_roberta import RobertaLMHead, RobertaModel logger = logging.getLogger(__name__) LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "allenai/longformer-base-4096", "allenai/longformer-large-4096", "allenai/longformer-large-4096-finetuned-triviaqa", "allenai/longformer-base-4096-extra.pos.embd.only", "allenai/longformer-large-4096-extra.pos.embd.only", # See all Longformer models at https://huggingface.co/models?filter=longformer ] def _get_question_end_index(input_ids, sep_token_id): """ Computes the index of the first occurance of `sep_token_id`. """ sep_token_indices = (input_ids == sep_token_id).nonzero() batch_size = input_ids.shape[0] assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions" assert ( sep_token_indices.shape[0] == 3 * batch_size ), f"There should be exactly three separator tokens: {sep_token_id} in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this error." return sep_token_indices.view(batch_size, 3, 2)[:, 0, 1] def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True): """ Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is True` else after `sep_token_id`. """ question_end_index = _get_question_end_index(input_ids, sep_token_id) question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1 # bool attention mask with True in locations of global attention attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device) if before_sep_token is True: attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.uint8) else: # last token is separation token and should not be counted and in the middle are two separation tokens attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.uint8) * ( attention_mask.expand_as(input_ids) < input_ids.shape[-1] ).to(torch.uint8) return attention_mask class LongformerSelfAttention(nn.Module): def __init__(self, config, layer_id): super().__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size self.query = nn.Linear(config.hidden_size, self.embed_dim) self.key = nn.Linear(config.hidden_size, self.embed_dim) self.value = nn.Linear(config.hidden_size, self.embed_dim) # separate projection layers for tokens with global attention self.query_global = nn.Linear(config.hidden_size, self.embed_dim) self.key_global = nn.Linear(config.hidden_size, self.embed_dim) self.value_global = nn.Linear(config.hidden_size, self.embed_dim) self.dropout = config.attention_probs_dropout_prob self.layer_id = layer_id attention_window = config.attention_window[self.layer_id] assert ( attention_window % 2 == 0 ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}" assert ( attention_window > 0 ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}" self.one_sided_attention_window_size = attention_window // 2 @staticmethod def _skew(x, direction): """Convert diagonals into columns (or columns into diagonals depending on `direction`""" x_padded = F.pad(x, direction) # padding value is not important because it will be overwritten x_padded = x_padded.view(*x_padded.size()[:-2], x_padded.size(-1), x_padded.size(-2)) return x_padded @staticmethod def _skew2(x): """shift every row 1 step to right converting columns into diagonals""" # X = B x C x M x L B, C, M, L = x.size() x = F.pad(x, (0, M + 1)) # B x C x M x (L+M+1). Padding value is not important because it'll be overwritten x = x.view(B, C, -1) # B x C x ML+MM+M x = x[:, :, :-M] # B x C x ML+MM x = x.view(B, C, M, M + L) # B x C, M x L+M x = x[:, :, :, :-1] return x @staticmethod def _chunk(x, w): """convert into overlapping chunkings. Chunk size = 2w, overlap size = w""" # non-overlapping chunks of size = 2w x = x.view(x.size(0), x.size(1) // (w * 2), w * 2, x.size(2)) # use `as_strided` to make the chunks overlap with an overlap size = w chunk_size = list(x.size()) chunk_size[1] = chunk_size[1] * 2 - 1 chunk_stride = list(x.stride()) chunk_stride[1] = chunk_stride[1] // 2 return x.as_strided(size=chunk_size, stride=chunk_stride) def _mask_invalid_locations(self, input_tensor, w) -> torch.Tensor: affected_seqlen = w beginning_mask_2d = input_tensor.new_ones(w, w + 1).tril().flip(dims=[0]) beginning_mask = beginning_mask_2d[None, :, None, :] ending_mask = beginning_mask.flip(dims=(1, 3)) seqlen = input_tensor.size(1) beginning_input = input_tensor[:, :affected_seqlen, :, : w + 1] beginning_mask = beginning_mask[:, :seqlen].expand(beginning_input.size()) beginning_input.masked_fill_(beginning_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 ending_input = input_tensor[:, -affected_seqlen:, :, -(w + 1) :] ending_mask = ending_mask[:, -seqlen:].expand(ending_input.size()) ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 def _sliding_chunks_matmul_qk(self, q: torch.Tensor, k: torch.Tensor, w: int): """Matrix multiplicatio of query x key tensors using with a sliding window attention pattern. This implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an overlap of size w""" batch_size, seqlen, num_heads, head_dim = q.size() assert seqlen % (w * 2) == 0, f"Sequence length should be multiple of {w * 2}. Given {seqlen}" assert q.size() == k.size() chunks_count = seqlen // w - 1 # group batch_size and num_heads dimensions into one, then chunk seqlen into chunks of size w * 2 q = q.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) k = k.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) chunk_q = self._chunk(q, w) chunk_k = self._chunk(k, w) # matrix multipication # bcxd: batch_size * num_heads x chunks x 2w x head_dim # bcyd: batch_size * num_heads x chunks x 2w x head_dim # bcxy: batch_size * num_heads x chunks x 2w x 2w chunk_attn = torch.einsum("bcxd,bcyd->bcxy", (chunk_q, chunk_k)) # multiply # convert diagonals into columns diagonal_chunk_attn = self._skew(chunk_attn, direction=(0, 0, 0, 1)) # allocate space for the overall attention matrix where the chunks are compined. The last dimension # has (w * 2 + 1) columns. The first (w) columns are the w lower triangles (attention from a word to # w previous words). The following column is attention score from each word to itself, then # followed by w columns for the upper triangle. diagonal_attn = diagonal_chunk_attn.new_empty((batch_size * num_heads, chunks_count + 1, w, w * 2 + 1)) # copy parts from diagonal_chunk_attn into the compined matrix of attentions # - copying the main diagonal and the upper triangle diagonal_attn[:, :-1, :, w:] = diagonal_chunk_attn[:, :, :w, : w + 1] diagonal_attn[:, -1, :, w:] = diagonal_chunk_attn[:, -1, w:, : w + 1] # - copying the lower triangle diagonal_attn[:, 1:, :, :w] = diagonal_chunk_attn[:, :, -(w + 1) : -1, w + 1 :] diagonal_attn[:, 0, 1:w, 1:w] = diagonal_chunk_attn[:, 0, : w - 1, 1 - w :] # separate batch_size and num_heads dimensions again diagonal_attn = diagonal_attn.view(batch_size, num_heads, seqlen, 2 * w + 1).transpose(2, 1) self._mask_invalid_locations(diagonal_attn, w) return diagonal_attn def _sliding_chunks_matmul_pv(self, prob: torch.Tensor, v: torch.Tensor, w: int): """Same as _sliding_chunks_matmul_qk but for prob and value tensors. It is expecting the same output format from _sliding_chunks_matmul_qk""" batch_size, seqlen, num_heads, head_dim = v.size() assert seqlen % (w * 2) == 0 assert prob.size()[:3] == v.size()[:3] assert prob.size(3) == 2 * w + 1 chunks_count = seqlen // w - 1 # group batch_size and num_heads dimensions into one, then chunk seqlen into chunks of size 2w chunk_prob = prob.transpose(1, 2).reshape(batch_size * num_heads, seqlen // w, w, 2 * w + 1) # group batch_size and num_heads dimensions into one v = v.transpose(1, 2).reshape(batch_size * num_heads, seqlen, head_dim) # pad seqlen with w at the beginning of the sequence and another w at the end padded_v = F.pad(v, (0, 0, w, w), value=-1) # chunk padded_v into chunks of size 3w and an overlap of size w chunk_v_size = (batch_size * num_heads, chunks_count + 1, 3 * w, head_dim) chunk_v_stride = padded_v.stride() chunk_v_stride = chunk_v_stride[0], w * chunk_v_stride[1], chunk_v_stride[1], chunk_v_stride[2] chunk_v = padded_v.as_strided(size=chunk_v_size, stride=chunk_v_stride) skewed_prob = self._skew2(chunk_prob) context = torch.einsum("bcwd,bcdh->bcwh", (skewed_prob, chunk_v)) return context.view(batch_size, num_heads, seqlen, head_dim).transpose(1, 2) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): """ LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`. Padding to `attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer. The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to -ve: no attention 0: local attention +ve: global attention `encoder_hidden_states` and `encoder_attention_mask` are not supported and should be None """ # TODO: add support for `encoder_hidden_states` and `encoder_attention_mask` assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None" assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and shiould be None" if attention_mask is not None: attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) key_padding_mask = attention_mask < 0 extra_attention_mask = attention_mask > 0 remove_from_windowed_attention_mask = attention_mask != 0 num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1) max_num_extra_indices_per_batch = num_extra_indices_per_batch.max() if max_num_extra_indices_per_batch <= 0: extra_attention_mask = None else: # To support the case of variable number of global attention in the rows of a batch, # we use the following three selection masks to select global attention embeddings # in a 3d tensor and pad it to `max_num_extra_indices_per_batch` # 1) selecting embeddings that correspond to global attention extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True) zero_to_max_range = torch.arange( 0, max_num_extra_indices_per_batch, device=num_extra_indices_per_batch.device ) # mask indicating which values are actually going to be padding selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1) # 2) location of the non-padding values in the selected global attention selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True) # 3) location of the padding values in the selected global attention selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True) else: remove_from_windowed_attention_mask = None extra_attention_mask = None key_padding_mask = None hidden_states = hidden_states.transpose(0, 1) seqlen, batch_size, embed_dim = hidden_states.size() assert embed_dim == self.embed_dim q = self.query(hidden_states) k = self.key(hidden_states) v = self.value(hidden_states) q /= math.sqrt(self.head_dim) q = q.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) k = k.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) # attn_weights = (batch_size, seqlen, num_heads, window*2+1) attn_weights = self._sliding_chunks_matmul_qk(q, k, self.one_sided_attention_window_size) self._mask_invalid_locations(attn_weights, self.one_sided_attention_window_size) if remove_from_windowed_attention_mask is not None: # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 # from (batch_size x seqlen) to (batch_size x seqlen x num_heads x hidden_size) remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze( dim=-1 ) # cast to fp32/fp16 then replace 1's with -inf float_mask = remove_from_windowed_attention_mask.type_as(q).masked_fill( remove_from_windowed_attention_mask, -10000.0 ) ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones # diagonal mask with zeros everywhere and -inf inplace of padding d_mask = self._sliding_chunks_matmul_qk(ones, float_mask, self.one_sided_attention_window_size) attn_weights += d_mask assert list(attn_weights.size()) == [ batch_size, seqlen, self.num_heads, self.one_sided_attention_window_size * 2 + 1, ] # the extra attention if extra_attention_mask is not None: selected_k = k.new_zeros(batch_size, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros] # (batch_size, seqlen, num_heads, max_num_extra_indices_per_batch) selected_attn_weights = torch.einsum("blhd,bshd->blhs", (q, selected_k)) selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000 # concat to attn_weights # (batch_size, seqlen, num_heads, extra attention count + 2*window+1) attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) attn_weights_fp32 = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability attn_weights = attn_weights_fp32.type_as(attn_weights) if key_padding_mask is not None: # softmax sometimes inserts NaN if all positions are masked, replace them with 0 attn_weights = torch.masked_fill(attn_weights, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) v = v.view(seqlen, batch_size, self.num_heads, self.head_dim).transpose(0, 1) attn = None if extra_attention_mask is not None: selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch) selected_v = v.new_zeros(batch_size, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros] # use `matmul` because `einsum` crashes sometimes with fp16 # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2)).transpose(1, 2) attn_probs = attn_probs.narrow( -1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch ).contiguous() if attn is None: attn = self._sliding_chunks_matmul_pv(attn_probs, v, self.one_sided_attention_window_size) else: attn += self._sliding_chunks_matmul_pv(attn_probs, v, self.one_sided_attention_window_size) assert attn.size() == (batch_size, seqlen, self.num_heads, self.head_dim), "Unexpected size" attn = attn.transpose(0, 1).reshape(seqlen, batch_size, embed_dim).contiguous() # For this case, we'll just recompute the attention for these indices # and overwrite the attn tensor. # TODO: remove the redundant computation if extra_attention_mask is not None: selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, batch_size, embed_dim) selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states[ extra_attention_mask_nonzeros[::-1] ] q = self.query_global(selected_hidden_states) k = self.key_global(hidden_states) v = self.value_global(hidden_states) q /= math.sqrt(self.head_dim) q = ( q.contiguous() .view(max_num_extra_indices_per_batch, batch_size * self.num_heads, self.head_dim) .transpose(0, 1) ) # (batch_size * self.num_heads, max_num_extra_indices_per_batch, head_dim) k = ( k.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) ) # batch_size * self.num_heads, seqlen, head_dim) v = ( v.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) ) # batch_size * self.num_heads, seqlen, head_dim) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_weights.size()) == [batch_size * self.num_heads, max_num_extra_indices_per_batch, seqlen] attn_weights = attn_weights.view(batch_size, self.num_heads, max_num_extra_indices_per_batch, seqlen) attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0 if key_padding_mask is not None: attn_weights = attn_weights.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), -10000.0,) attn_weights = attn_weights.view(batch_size * self.num_heads, max_num_extra_indices_per_batch, seqlen) attn_weights_float = F.softmax( attn_weights, dim=-1, dtype=torch.float32 ) # use fp32 for numerical stability attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) selected_attn = torch.bmm(attn_probs, v) assert list(selected_attn.size()) == [ batch_size * self.num_heads, max_num_extra_indices_per_batch, self.head_dim, ] selected_attn_4d = selected_attn.view( batch_size, self.num_heads, max_num_extra_indices_per_batch, self.head_dim ) nonzero_selected_attn = selected_attn_4d[ selection_padding_mask_nonzeros[0], :, selection_padding_mask_nonzeros[1] ] attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view( len(selection_padding_mask_nonzeros[0]), -1 ) context_layer = attn.transpose(0, 1) if self.output_attentions: if extra_attention_mask is not None: # With global attention, return global attention probabilities only # batch_size x num_heads x max_num_global_attention_tokens x sequence_length # which is the attention weights from tokens with global attention to all tokens # It doesn't not return local attention # In case of variable number of global attantion in the rows of a batch, # attn_weights are padded with -10000.0 attention scores attn_weights = attn_weights.view(batch_size, self.num_heads, max_num_extra_indices_per_batch, seqlen) else: # without global attention, return local attention probabilities # batch_size x num_heads x sequence_length x window_size # which is the attention weights of every token attending to its neighbours attn_weights = attn_weights.permute(0, 2, 1, 3) outputs = (context_layer, attn_weights) if self.output_attentions else (context_layer,) return outputs LONGFORMER_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.LongformerConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ LONGFORMER_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.LonmgformerTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to decide the attention given on each token, local attention or global attenion. Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is important for task-specific finetuning because it makes the model more flexible at representing the task. For example, for classification, the token should be given global attention. For QA, all question tokens should also have global attention. Please refer to the Longformer paper https://arxiv.org/abs/2004.05150 for more details. Mask values selected in ``[0, 1]``: ``0`` for local attention (a sliding window attention), ``1`` for global attention (tokens that attend to all other tokens, and all other tokens attend to them). token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Longformer Model outputting raw hidden-states without any specific head on top.", LONGFORMER_START_DOCSTRING, ) class LongformerModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel` to provide the ability to process long sequences following the selfattention approach described in `Longformer: the Long-Document Transformer`_by Iz Beltagy, Matthew E. Peters, and Arman Cohan. Longformer selfattention combines a local (sliding window) and global attention to extend to long documents without the O(n^2) increase in memory and compute. The selfattention module `LongformerSelfAttention` implemented here supports the combination of local and global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and dilated attention are more relevant for autoregressive language modeling than finetuning on downstream tasks. Future release will add support for autoregressive attention, but the support for dilated attention requires a custom CUDA kernel to be memory and compute efficient. .. _`Longformer: the Long-Document Transformer`: https://arxiv.org/abs/2004.05150 """ config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) if isinstance(config.attention_window, int): assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value" assert config.attention_window > 0, "`config.attention_window` has to be positive" config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer else: assert len(config.attention_window) == config.num_hidden_layers, ( "`len(config.attention_window)` should equal `config.num_hidden_layers`. " f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" ) for i, layer in enumerate(self.encoder.layer): # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention` layer.attention.self = LongformerSelfAttention(config, layer_id=i) self.init_weights() def _pad_to_window_size( self, input_ids: torch.Tensor, attention_mask: torch.Tensor, token_type_ids: torch.Tensor, position_ids: torch.Tensor, inputs_embeds: torch.Tensor, attention_window: int, pad_token_id: int, ): """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}" input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape batch_size, seqlen = input_shape[:2] padding_len = (attention_window - seqlen % attention_window) % attention_window if padding_len > 0: logger.info( "Input ids are automatically padded from {} to {} to be a multiple of `config.attention_window`: {}".format( seqlen, seqlen + padding_len, attention_window ) ) if input_ids is not None: input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) if attention_mask is not None: attention_mask = F.pad( attention_mask, (0, padding_len), value=False ) # no attention on the padding tokens if token_type_ids is not None: token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 if position_ids is not None: # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) if inputs_embeds is not None: input_ids_padding = inputs_embeds.new_full( (batch_size, padding_len), self.config.pad_token_id, dtype=torch.long, ) inputs_embeds_padding = self.embeddings(input_ids_padding) inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, masked_lm_labels=None, ): r""" Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import LongformerModel, LongformerTokenizer model = LongformerModel.from_pretrained('allenai/longformer-base-4096') tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 # Attention mask values -- 0: no attention, 1: local attention, 2: global attention attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example, # classification: the token # QA: question tokens # LM: potentially on the beginning of sentences and paragraphs sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask) """ # padding attention_window = ( self.config.attention_window if isinstance(self.config.attention_window, int) else max(self.config.attention_window) ) # merge `global_attention_mask` and `attention_mask` if global_attention_mask is not None: # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) # (global_attention_mask + 1) => 1 for local attention, 2 for global attention # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention if attention_mask is not None: attention_mask = attention_mask * (global_attention_mask + 1) else: # simply use `global_attention_mask` as `attention_mask` # if no `attention_mask` is given attention_mask = global_attention_mask + 1 padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self._pad_to_window_size( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, attention_window=attention_window, pad_token_id=self.config.pad_token_id, ) # embed output = super().forward( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=None, inputs_embeds=inputs_embeds, encoder_hidden_states=None, encoder_attention_mask=None, ) # undo padding if padding_len > 0: # `output` has the following tensors: sequence_output, pooled_output, (hidden_states), (attentions) # `sequence_output`: unpad because the calling function is expecting a length == input_ids.size(1) # `pooled_output`: independent of the sequence length # `hidden_states`: mainly used for debugging and analysis, so keep the padding # `attentions`: mainly used for debugging and analysis, so keep the padding output = output[0][:, :-padding_len], *output[1:] return output @add_start_docstrings("""Longformer Model with a `language modeling` head on top. """, LONGFORMER_START_DOCSTRING) class LongformerForMaskedLM(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.longformer = LongformerModel(config) self.lm_head = RobertaLMHead(config) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers1 import LongformerForMaskedLM, LongformerTokenizer model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM # check ``LongformerModel.forward`` for more details how to set `attention_mask` loss, prediction_scores = model(input_ids, attention_mask=attention_mask, masked_lm_labels=input_ids) """ outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForSequenceClassification(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.classifier = LongformerClassificationHead(config) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForSequenceClassification import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ if global_attention_mask is None: logger.info("Initializing global attention on CLS token...") global_attention_mask = torch.zeros_like(input_ids) # global attention on cls token global_attention_mask[:, 0] = 1 outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) class LongformerClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, hidden_states, **kwargs): hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) hidden_states = self.dropout(hidden_states) hidden_states = self.dense(hidden_states) hidden_states = torch.tanh(hidden_states) hidden_states = self.dropout(hidden_states) output = self.out_proj(hidden_states) return output @add_start_docstrings( """Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD / TriviaQA (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForQuestionAnswering import torch tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere # the forward method will automatically set global attention on question tokens attention_mask = encoding["attention_mask"] start_scores, end_scores = model(input_ids, attention_mask=attention_mask) all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1] answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token """ # set global attention on question tokens if global_attention_mask is None: logger.info("Initializing global attention on question tokens...") # put global attention on all tokens until `config.sep_token_id` is reached global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id) outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.longformer = LongformerModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, global_attention_mask=None, token_type_ids=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.LongformerConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForTokenClassification import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.longformer( input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, LONGFORMER_START_DOCSTRING, ) class LongformerForMultipleChoice(BertPreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" def __init__(self, config): super().__init__(config) self.longformer = LongformerModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, attention_mask=None, global_attention_mask=None, labels=None, position_ids=None, inputs_embeds=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import LongformerTokenizer, LongformerForMultipleChoice import torch tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096') # context = "The dog is cute" | choice = "the dog" / "the cat" choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")] input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 # global attention is automatically put on "the dog" and "the cat" outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] # set global attention on question tokens if global_attention_mask is None: logger.info("Initializing global attention on multiple choice...") # put global attention on all tokens after `config.sep_token_id` global_attention_mask = torch.stack( [ _compute_global_attention_mask(input_ids[:, i], self.config.sep_token_id, before_sep_token=False) for i in range(num_choices) ], dim=1, ) flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_global_attention_mask = ( global_attention_mask.view(-1, global_attention_mask.size(-1)) if global_attention_mask is not None else None ) outputs = self.longformer( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, global_attention_mask=flat_global_attention_mask, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_marian.py ================================================ # coding=utf-8 # Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" from .modeling_bart import BartForConditionalGeneration MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP ] class MarianMTModel(BartForConditionalGeneration): r""" Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Model API is identical to BartForConditionalGeneration. Available models are listed at `Model List `__ Examples:: from transformers1 import MarianTokenizer, MarianMTModel from typing import List src = 'fr' # source language trg = 'en' # target language sample_text = "où est l'arrêt de bus ?" mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' model = MarianMTModel.from_pretrained(mname) tok = MarianTokenizer.from_pretrained(mname) batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference gen = model.generate(**batch) # for forward pass: model(**batch) words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" """ def prepare_logits_for_generation(self, logits, cur_len, max_length): logits[:, self.config.pad_token_id] = float("-inf") if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_ids_generation(logits, self.config.eos_token_id) return logits ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_mmbt.py ================================================ # coding=utf-8 # Copyright (c) Facebook, Inc. and its affiliates. # Copyright (c) HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch MMBT model. """ import logging import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .file_utils import add_start_docstrings from .modeling_utils import ModuleUtilsMixin logger = logging.getLogger(__name__) class ModalEmbeddings(nn.Module): """Generic Modal Embeddings which takes in an encoder, and a transformer embedding. """ def __init__(self, config, encoder, embeddings): super().__init__() self.config = config self.encoder = encoder self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size) self.position_embeddings = embeddings.position_embeddings self.token_type_embeddings = embeddings.token_type_embeddings self.word_embeddings = embeddings.word_embeddings self.LayerNorm = embeddings.LayerNorm self.dropout = nn.Dropout(p=config.hidden_dropout_prob) def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None): token_embeddings = self.proj_embeddings(self.encoder(input_modal)) seq_length = token_embeddings.size(1) if start_token is not None: start_token_embeds = self.word_embeddings(start_token) seq_length += 1 token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1) if end_token is not None: end_token_embeds = self.word_embeddings(end_token) seq_length += 1 token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1) if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device) position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) if token_type_ids is None: token_type_ids = torch.zeros( (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device ) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = token_embeddings + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings MMBT_START_DOCSTRING = r""" MMBT model was proposed in `Supervised Multimodal Bitransformers for Classifying Images and Text`_ by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and obtain state-of-the-art performance on various multimodal classification benchmark tasks. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. .. _`Supervised Multimodal Bitransformers for Classifying Images and Text`: https://github.com/facebookresearch/mmbt .. _`torch.nn.Module`: https://pytorch.org/docs/stable/nn.html#module Parameters: config (:class:`~transformers1.MMBTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. transformer (:class: `~nn.Module`): A text transformer that is used by MMBT. It should have embeddings, encoder, and pooler attributes. encoder (:class: `~nn.Module`): Encoder for the second modality. It should take in a batch of modal inputs and return k, n dimension embeddings. """ MMBT_INPUTS_DOCSTRING = r""" Inputs: **input_modal**: ``torch.FloatTensor`` of shape ``(batch_size, ***)``: The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image Encoder, the shape would be (batch_size, channels, height, width) **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's appended to the end of other modality embeddings. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. **modal_start_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for Classification tasks. **modal_end_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Segment token indices to indicate different portions of the inputs. **modal_token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``: Segment token indices to indicate different portions of the non-text modality. The embeddings from these tokens will be summed with the respective token embeddings for the non-text modality. **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of positions of each input sequence tokens in the position embeddings. **modal_position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``: Indices of positions of each input sequence tokens in the position embeddings for the non-text modality. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``: Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ @add_start_docstrings( "The bare MMBT Model outputting raw hidden-states without any specific head on top.", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING, ) class MMBTModel(nn.Module, ModuleUtilsMixin): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` Sequence of hidden-states at the output of the last layer of the model. **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: (`optional`, returned when ``config.output_attentions=True``) list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. transformer = BertModel.from_pretrained('bert-base-uncased') encoder = ImageEncoder(args) mmbt = MMBTModel(config, transformer, encoder) """ def __init__(self, config, transformer, encoder): super().__init__() self.config = config self.transformer = transformer self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) def forward( self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, modal_position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_txt_shape = input_ids.size() elif inputs_embeds is not None: input_txt_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device modal_embeddings = self.modal_encoder( input_modal, start_token=modal_start_tokens, end_token=modal_end_tokens, position_ids=modal_position_ids, token_type_ids=modal_token_type_ids, ) input_modal_shape = modal_embeddings.size()[:-1] if token_type_ids is None: token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) txt_embeddings = self.transformer.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) input_shape = embedding_output.size()[:-1] if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) else: attention_mask = torch.cat( [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 ) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(input_shape, device=device) else: encoder_attention_mask = torch.cat( [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 ) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) encoder_outputs = self.transformer.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.transformer.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value @add_start_docstrings( """MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING, ) class MMBTForClassification(nn.Module): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: (`optional`, returned when ``config.output_attentions=True``) list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. transformer = BertModel.from_pretrained('bert-base-uncased') encoder = ImageEncoder(args) model = MMBTForClassification(config, transformer, encoder) outputs = model(input_modal, input_ids, labels=labels) loss, logits = outputs[:2] """ def __init__(self, config, transformer, encoder): super().__init__() self.num_labels = config.num_labels self.mmbt = MMBTModel(config, transformer, encoder) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) def forward( self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): outputs = self.mmbt( input_modal=input_modal, input_ids=input_ids, modal_start_tokens=modal_start_tokens, modal_end_tokens=modal_end_tokens, attention_mask=attention_mask, token_type_ids=token_type_ids, modal_token_type_ids=modal_token_type_ids, position_ids=position_ids, modal_position_ids=modal_position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch OpenAI GPT model.""" import json import logging import math import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from .activations import gelu_new, swish from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "openai-gpt", # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt ] def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) """ import re import numpy as np if ".ckpt" in openai_checkpoint_folder_path: openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle: names = json.load(names_handle) with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle: shapes = json.load(shapes_handle) offsets = np.cumsum([np.prod(shape) for shape in shapes]) init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)] init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] # This was used when we had a single embedding matrix for positions and tokens # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) # model init_params[1] init_params = [arr.squeeze() for arr in init_params] try: assert model.tokens_embed.weight.shape == init_params[1].shape assert model.positions_embed.weight.shape == init_params[0].shape except AssertionError as e: e.args += (model.tokens_embed.weight.shape, init_params[1].shape) e.args += (model.positions_embed.weight.shape, init_params[0].shape) raise model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) model.positions_embed.weight.data = torch.from_numpy(init_params[0]) names.pop(0) # Pop position and token embedding arrays init_params.pop(0) init_params.pop(0) for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): name = name[6:] # skip "model/" assert name[-2:] == ":0" name = name[:-2] name = name.split("/") pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): scope_names = re.split(r"(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "g": pointer = getattr(pointer, "weight") elif scope_names[0] == "b": pointer = getattr(pointer, "bias") elif scope_names[0] == "w": pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, scope_names[0]) if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new} class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super().__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) self.n_head = config.n_head self.split_size = n_state self.scale = scale self.output_attentions = config.output_attentions self.c_attn = Conv1D(n_state * 3, nx) self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights # XD: self.b may be larger than w, so we need to crop it b = self.bias[:, :, : w.size(-2), : w.size(-1)] w = w * b + -1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [torch.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = x.permute(0, 2, 1, 3).contiguous() new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states def split_heads(self, x, k=False): new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states if k: return x.permute(0, 2, 3, 1) else: return x.permute(0, 2, 1, 3) def forward(self, x, attention_mask=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a) outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) self.act = ACT_FNS[config.afn] self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) return self.dropout(h2) class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super().__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x, attention_mask=None, head_mask=None): attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask) a = attn_outputs[0] n = self.ln_1(x + a) m = self.mlp(n) h = self.ln_2(n + m) outputs = [h] + attn_outputs[1:] return outputs class OpenAIGPTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = OpenAIGPTConfig load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) OPENAI_GPT_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.OpenAIGPTTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.init_weights() def get_input_embeddings(self): return self.tokens_embed def set_input_embeddings(self, new_embeddings): self.tokens_embed = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTModel.from_pretrained('openai-gpt') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if position_ids is None: # Code is different from when we had a single embedding matrice from position and token embeddings device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange(input_shape[-1], dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) # Attention mask. if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layer) if inputs_embeds is None: inputs_embeds = self.tokens_embed(input_ids) position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) token_type_embeds = self.tokens_embed(token_type_ids) else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) all_attentions = () all_hidden_states = () for i, block in enumerate(self.h): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = block(hidden_states, attention_mask, head_mask[i]) hidden_states = outputs[0] if self.output_attentions: all_attentions = all_attentions + (outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) outputs = (hidden_states.view(*output_shape),) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last hidden state, (all hidden states), (all attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, (all hidden states), (all attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super().__init__(config) config.num_labels = 1 self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) self.init_weights() def get_output_embeddings(self): return self.lm_head @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, lm_labels=None, mc_labels=None, ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): Language modeling loss. mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) model.resize_token_embeddings(len(tokenizer)) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch REFORMER model. """ import logging import sys from collections import namedtuple from functools import reduce from operator import mul import numpy as np import torch from torch import nn from torch.autograd.function import Function from torch.nn import CrossEntropyLoss from .activations import gelu, gelu_fast, gelu_new, swish from .configuration_reformer import ReformerConfig from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, apply_chunking_to_forward logger = logging.getLogger(__name__) REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/reformer-crime-and-punishment", "google/reformer-enwik8", # See all Reformer models at https://huggingface.co/models?filter=reformer ] def mish(x): return x * torch.tanh(nn.functional.softplus(x)) ACT2FN = { "gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "gelu_fast": gelu_fast, "mish": mish, } # Define named tuples for nn.Modules here LSHSelfAttentionOutput = namedtuple("LSHSelfAttentionOutput", ["hidden_states", "attention_probs", "buckets"]) LocalSelfAttentionOutput = namedtuple("LocalSelfAttentionOutput", ["hidden_states", "attention_probs"]) AttentionOutput = namedtuple("AttentionOutput", ["hidden_states", "attention_probs", "buckets"]) ReformerOutput = namedtuple("ReformerOutput", ["hidden_states", "attn_output", "attention_probs", "buckets"]) ReformerBackwardOutput = namedtuple( "ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"] ) ReformerEncoderOutput = namedtuple("ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions"]) def _get_least_common_mult_chunk_len(config): attn_types = config.attn_layers attn_types_set = set(attn_types) if len(attn_types_set) == 1 and attn_types[0] == "lsh": return config.lsh_attn_chunk_length elif len(attn_types_set) == 1 and attn_types[0] == "local": return config.local_attn_chunk_length elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]): return np.lcm(config.lsh_attn_chunk_length, config.local_attn_chunk_length) else: raise NotImplementedError( "Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format( config.attn_layers ) ) class AxialPositionEmbeddings(nn.Module): """Constructs axial position embeddings. Useful for very long input sequences to save memory and time. """ def __init__(self, config): super().__init__() self.axial_pos_shape = config.axial_pos_shape self.axial_pos_embds_dim = config.axial_pos_embds_dim self.dropout = config.hidden_dropout_prob self.least_common_mult_chunk_length = _get_least_common_mult_chunk_len(config) self.weights = nn.ParameterList() assert ( sum(self.axial_pos_embds_dim) == config.hidden_size ), "Make sure that config.axial_pos_embds factors: {} sum to config.hidden_size: {}".format( self.axial_pos_embds_dim, config.hidden_size ) # create weights for axis, axial_pos_embd_dim in enumerate(self.axial_pos_embds_dim): # create expanded shapes ax_shape = [1] * len(self.axial_pos_shape) ax_shape[axis] = self.axial_pos_shape[axis] ax_shape = tuple(ax_shape) + (axial_pos_embd_dim,) # create tensor and init self.weights.append(nn.Parameter(torch.ones(ax_shape, dtype=torch.float32))) def forward(self, position_ids): # broadcast weights to correct shape batch_size = position_ids.shape[0] sequence_length = position_ids.shape[1] broadcasted_weights = [ weight.expand((batch_size,) + self.axial_pos_shape + weight.shape[-1:]) for weight in self.weights ] if self.training is True: assert ( reduce(mul, self.axial_pos_shape) == sequence_length ), "If training, make sure that config.axial_pos_shape factors: {} multiply to sequence length. Got prod({}) != sequence_length: {}. You might want to consider padding your sequence length to {} or changing config.axial_pos_shape.".format( self.axial_pos_shape, self.axial_pos_shape, sequence_length, reduce(mul, self.axial_pos_shape) ) if self.dropout > 0: weights = torch.cat(broadcasted_weights, dim=-1) # permute weights so that 2D correctly drops dims 1 and 2 transposed_weights = weights.transpose(2, 1) # drop entire matrix of last two dims (prev dims 1 and 2) dropped_transposed_weights = nn.functional.dropout2d( transposed_weights, p=self.dropout, training=self.training ) dropped_weights = dropped_transposed_weights.transpose(2, 1) position_encodings = torch.reshape(dropped_weights, (batch_size, sequence_length, -1)) else: position_encodings = torch.cat( [torch.reshape(weight, (batch_size, sequence_length, -1)) for weight in broadcasted_weights], dim=-1, ) else: assert ( reduce(mul, self.axial_pos_shape) >= sequence_length ), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format( self.axial_pos_shape, sequence_length, self.least_common_mult_chunk_length, ) # reshape axial encodings and use only until sequence_length position_encodings = torch.cat(broadcasted_weights, dim=-1) position_encodings = position_encodings.view(batch_size, -1, position_encodings.shape[-1])[ :, :sequence_length ] return position_encodings class PositionEmbeddings(nn.Module): """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`. """ def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) def forward(self, position_ids): position_embeddings = self.embedding(position_ids) position_embeddings = nn.functional.dropout(position_embeddings, p=self.dropout, training=self.training) return position_embeddings class ReformerEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.max_position_embeddings = config.max_position_embeddings self.dropout = config.hidden_dropout_prob self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) self.position_embeddings = ( AxialPositionEmbeddings(config) if config.axial_pos_embds else PositionEmbeddings(config) ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() device = input_ids.device else: input_shape = inputs_embeds.size()[:-1] device = inputs_embeds.device seq_length = input_shape[1] if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) assert ( position_ids.shape[-1] <= self.max_position_embeddings ), "Sequence Length: {} has to be larger equal than config.max_position_embeddings: {}".format( position_ids.shape[-1], self.max_position_embeddings ) # dropout embeddings = nn.functional.dropout(inputs_embeds, p=self.dropout, training=self.training) # add positional embeddings position_embeddings = self.position_embeddings(position_ids) embeddings = embeddings + position_embeddings return embeddings class EfficientAttentionMixin: """ A few utilities for nn.Modules in Reformer, to be used as a mixin. """ def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after): """ Used to implement attention between consecutive chunks. Args: vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...] num_chunks_before: chunks before current chunk to include in attention num_chunks_after: chunks after current chunk to include in attention Returns: tensor of shape [num_chunks, N * chunk_length, ...], where N = (1 + num_chunks_before + num_chunks_after). """ if num_chunks_before == 0 and num_chunks_after == 0: return vectors slices = [] for i in range(-num_chunks_before, num_chunks_after + 1): if i == 0: slices.append(vectors) else: slices.append(torch.cat([vectors[:, :, i:, ...], vectors[:, :, :i, ...]], dim=2)) return torch.cat(slices, dim=3) def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size): """ splits hidden_size dim into attn_head_size and num_attn_heads """ new_x_shape = x.size()[:-1] + (num_attn_heads, attn_head_size) x = x.view(*new_x_shape) return x.transpose(2, 1) def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size): """ merges attn_head_size dim and num_attn_heads dim into hidden_size """ x = x.permute(0, 2, 1, 3) return torch.reshape(x, (x.size()[0], -1, num_attn_heads * attn_head_size)) def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None): """ splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims """ batch_size = vectors.shape[0] split_dim_shape = (batch_size, num_attn_heads, dim_factor_1, dim_factor_2) if len(vectors.shape) == 4: return torch.reshape(vectors, split_dim_shape + (attn_head_size,)) elif len(vectors.shape) == 3: return torch.reshape(vectors, split_dim_shape) else: raise ValueError("Input vector rank should be one of [3, 4], but is: {}".format(len(vectors.shape))) class LSHSelfAttention(nn.Module, EfficientAttentionMixin): def __init__(self, config): super().__init__() self.config = config self.chunk_length = config.lsh_attn_chunk_length self.num_hashes = config.num_hashes self.num_buckets = config.num_buckets self.num_chunks_before = config.lsh_num_chunks_before self.num_chunks_after = config.lsh_num_chunks_after self.hash_seed = config.hash_seed self.is_decoder = config.is_decoder self.max_position_embeddings = config.max_position_embeddings self.dropout = config.lsh_attention_probs_dropout_prob self.num_attention_heads = config.num_attention_heads self.attention_head_size = config.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.hidden_size = config.hidden_size # projection matrices self.query_key = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) # save mask value here. Need fp32 and fp16 mask values self.register_buffer("self_mask_value_float16", torch.tensor(-1e3)) self.register_buffer("self_mask_value_float32", torch.tensor(-1e5)) self.register_buffer("mask_value_float16", torch.tensor(-1e4)) self.register_buffer("mask_value_float32", torch.tensor(-1e9)) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, buckets=None, **kwargs ): sequence_length = hidden_states.shape[1] batch_size = hidden_states.shape[0] # num hashes can optionally be overwritten by user num_hashes = num_hashes if num_hashes is not None else self.num_hashes # project hidden_states to query_key and value query_key_vectors = self.query_key(hidden_states) value_vectors = self.value(hidden_states) # free memory del hidden_states query_key_vectors = self._split_hidden_size_dim( query_key_vectors, self.num_attention_heads, self.attention_head_size ) value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size) assert ( query_key_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( query_key_vectors.shape[-1], self.attention_head_size ) assert ( value_vectors.shape[-1] == self.attention_head_size ), "last dim of value_vectors is {} but should be {}.".format( value_vectors.shape[-1], self.attention_head_size ) # set `num_buckets` on the fly, recommended way to do it if self.num_buckets is None: self._set_num_buckets(sequence_length) # use cached buckets for backprop only if buckets is None: # hash query key vectors into buckets buckets = self._hash_vectors(query_key_vectors, num_hashes) assert ( int(buckets.shape[-1]) == num_hashes * sequence_length ), "last dim of buckets is {}, but should be {}".format(buckets.shape[-1], num_hashes * sequence_length) sorted_bucket_idx, undo_sorted_bucket_idx = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx( sequence_length, buckets, num_hashes ) # make sure bucket idx is not longer then sequence length sorted_bucket_idx = sorted_bucket_idx % sequence_length # cluster query key value vectors according to hashed buckets query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx, num_hashes) value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx, num_hashes) query_key_vectors = self._split_seq_length_dim_to( query_key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) value_vectors = self._split_seq_length_dim_to( value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) if self.chunk_length is None: assert ( self.num_chunks_before == 0 and self.num_chunks_after == 0 ), "If `config.chunk_length` is `None`, make sure `config.num_chunks_after` and `config.num_chunks_before` are set to 0." # scale key vectors key_vectors = self._len_and_dim_norm(query_key_vectors) # get attention probs out_vectors, logits, attention_probs = self._attend( query_vectors=query_key_vectors, key_vectors=key_vectors, value_vectors=value_vectors, sorted_bucket_idx=sorted_bucket_idx, attention_mask=attention_mask, head_mask=head_mask, ) # free memory del query_key_vectors, key_vectors, value_vectors # sort clusters back to correct ordering out_vectors, logits = ReverseSort.apply( out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx, self.num_hashes ) # sum up all hash rounds if num_hashes > 1: out_vectors = self._split_seq_length_dim_to( out_vectors, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size, ) logits = self._split_seq_length_dim_to( logits, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size, ).unsqueeze(-1) probs_vectors = torch.exp(logits - torch.logsumexp(logits, dim=2, keepdim=True)) out_vectors = torch.sum(out_vectors * probs_vectors, dim=2) # free memory del probs_vectors # free memory del logits assert out_vectors.shape == ( batch_size, self.num_attention_heads, sequence_length, self.attention_head_size, ), "out_vectors have be of shape `[batch_size, config.num_attention_heads, sequence_length, config.attention_head_size]`." out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) if do_output_attentions is False: attention_probs = () return LSHSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs, buckets=buckets) def _hash_vectors(self, vectors, num_hashes): batch_size = vectors.shape[0] # See https://arxiv.org/pdf/1509.02897.pdf # We sample a different random rotation for each round of hashing to # decrease the probability of hash misses. if isinstance(self.num_buckets, int): assert ( self.num_buckets % 2 == 0 ), "There should be an even number of bucktes, but `self.num_bucktes`: {}".format(self.num_buckets) rotation_size = self.num_buckets num_buckets = self.num_buckets else: # Factorize the hash if self.num_buckets is a list or tuple rotation_size, num_buckets = 0, 1 for bucket_factor in self.num_buckets: assert bucket_factor % 2 == 0, "The number of buckets should be even, but `num_bucket`: {}".format( bucket_factor ) rotation_size = rotation_size + bucket_factor num_buckets = num_buckets * bucket_factor # remove gradient vectors = vectors.detach() if self.hash_seed is not None: # for determinism torch.manual_seed(self.hash_seed) rotations_shape = (self.num_attention_heads, vectors.shape[-1], num_hashes, rotation_size // 2) # create a random self.attention_head_size x num_hashes x num_buckets/2 random_rotations = torch.randn(rotations_shape, device=vectors.device, dtype=vectors.dtype) # Output dim: Batch_Size x Num_Attn_Heads x Num_Hashes x Seq_Len x Num_Buckets/2 rotated_vectors = torch.einsum("bmtd,mdhr->bmhtr", vectors, random_rotations) if isinstance(self.num_buckets, int) or len(self.num_buckets) == 1: rotated_vectors = torch.cat([rotated_vectors, -rotated_vectors], dim=-1) buckets = torch.argmax(rotated_vectors, dim=-1) else: # Get the buckets for them and combine. buckets, cur_sum, cur_product = None, 0, 1 for bucket_factor in self.num_buckets: rotated_vectors_factor = rotated_vectors[..., cur_sum : cur_sum + (bucket_factor // 2)] cur_sum = cur_sum + bucket_factor // 2 rotated_vectors_factor = torch.cat([rotated_vectors_factor, -rotated_vectors_factor], dim=-1) if buckets is None: buckets = torch.argmax(rotated_vectors_factor, dim=-1) else: buckets = buckets + (cur_product * torch.argmax(rotated_vectors_factor, dim=-1)) cur_product = cur_product * bucket_factor # buckets is now (Batch_size x Num_Attn_Heads x Num_Hashes x Seq_Len). # Next we add offsets so that bucket numbers from different hashing rounds don't overlap. offsets = torch.arange(num_hashes, device=vectors.device) offsets = (offsets * num_buckets).view((1, 1, -1, 1)) # expand to batch size and num attention heads offsets = offsets.expand((batch_size, self.num_attention_heads) + offsets.shape[-2:]) offset_buckets = (buckets + offsets).flatten(start_dim=2, end_dim=3) return offset_buckets def _get_sorted_bucket_idx_and_undo_sorted_bucket_idx(self, sequence_length, buckets, num_hashes): # no gradients are needed with torch.no_grad(): batch_size = buckets.shape[0] # arange and expand orig_indices = torch.arange(num_hashes * sequence_length, device=buckets.device).view(1, 1, -1) orig_indices = orig_indices.expand(batch_size, self.num_attention_heads, orig_indices.shape[-1]) # scale buckets scaled_buckets = sequence_length * buckets + (orig_indices % sequence_length) # remove gradient scaled_buckets = scaled_buckets.detach() # Hash-based sort sorted_bucket_idx = torch.argsort(scaled_buckets, dim=-1) # create simple indices to scatter to, to have undo sort indices = ( torch.arange(sorted_bucket_idx.shape[-1], device=buckets.device) .view(1, 1, -1) .expand(sorted_bucket_idx.shape) ) # get undo sort undo_sorted_bucket_idx = sorted_bucket_idx.new(*sorted_bucket_idx.size()) undo_sorted_bucket_idx.scatter_(-1, sorted_bucket_idx, indices) return sorted_bucket_idx, undo_sorted_bucket_idx def _set_num_buckets(self, sequence_length): # `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1 # make sure buckets are power of 2 num_buckets = 2 ** num_buckets_pow_2 # factorize `num_buckets` if `num_buckets` becomes too large num_buckets_limit = 2 * max( int((self.max_position_embeddings // self.chunk_length) ** (0.5)), self.chunk_length, ) if num_buckets > num_buckets_limit: num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)] logger.warning("config.num_buckets is not set. Setting config.num_buckets to {}...".format(num_buckets)) # set num buckets in config to be properly saved self.config.num_buckets = num_buckets self.num_buckets = num_buckets def _attend( self, query_vectors, key_vectors, value_vectors, sorted_bucket_idx, attention_mask, head_mask, ): key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) # get logits and dots query_key_dots = torch.matmul(query_vectors, key_vectors.transpose(-1, -2)) # free memory del query_vectors, key_vectors query_bucket_idx = self._split_seq_length_dim_to( sorted_bucket_idx, -1, self.chunk_length, self.num_attention_heads ) key_value_bucket_idx = self._look_adjacent(query_bucket_idx, self.num_chunks_before, self.num_chunks_after) # get correct mask values depending on precision if query_key_dots.dtype == torch.float16: self_mask_value = self.self_mask_value_float16.half() mask_value = self.mask_value_float16.half() else: self_mask_value = self.self_mask_value_float32 mask_value = self.mask_value_float32 mask = self._compute_attn_mask(query_bucket_idx, key_value_bucket_idx, attention_mask) if mask is not None: query_key_dots = torch.where(mask, query_key_dots, mask_value) # free memory del mask # Self mask is ALWAYS applied. # From the reformer paper (https://arxiv.org/pdf/2001.04451.pdf): # " While attention to the future is not allowed, typical implementations of the # Transformer do allow a position to attend to itself. # Such behavior is undesirable in a shared-QK formulation because the dot-product # of a query vector with itself will almost always be greater than the dot product of a # query vector with a vector at another position. We therefore modify the masking # to forbid a token from attending to itself, except in situations # where a token has no other valid attention targets (e.g. the first token in a sequence) " self_mask = torch.ne(query_bucket_idx.unsqueeze(-1), key_value_bucket_idx.unsqueeze(-2)).to( query_bucket_idx.device ) # apply self_mask query_key_dots = torch.where(self_mask, query_key_dots, self_mask_value) # free memory del self_mask logits = torch.logsumexp(query_key_dots, dim=-1, keepdim=True) # dots shape is `[batch_size, num_attn_heads, num_hashes * seq_len // chunk_length, chunk_length, chunk_length * (1 + num_chunks_before + num_chunks_after)]` attention_probs = torch.exp(query_key_dots - logits) # free memory del query_key_dots # dropout attention_probs = nn.functional.dropout(attention_probs, p=self.dropout, training=self.training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask # attend values out_vectors = torch.matmul(attention_probs, value_vectors) # free memory del value_vectors # merge chunk length logits = logits.flatten(start_dim=2, end_dim=3).squeeze(-1) out_vectors = out_vectors.flatten(start_dim=2, end_dim=3) return out_vectors, logits, attention_probs def _compute_attn_mask(self, query_indices, key_indices, attention_mask): mask = None # Causal mask if self.is_decoder: mask = torch.ge(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).to(query_indices.device) # Attention mask: chunk, look up correct mask value from key_value_bucket_idx # IMPORTANT: official trax code does not use a mask for LSH Atttention. Not sure why. if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, None, :] # expand attn_mask to fit with key_value_bucket_idx shape attention_mask = attention_mask.expand(query_indices.shape[:-1] + (-1,)) key_attn_mask = torch.gather(attention_mask, -1, key_indices) query_attn_mask = torch.gather(attention_mask, -1, query_indices) # expand to query_key_dots shape: duplicate along query axis since key sorting is the same for each query position in chunk attn_mask = query_attn_mask.unsqueeze(-1) * key_attn_mask.unsqueeze(-2) # free memory del query_attn_mask, key_attn_mask, attention_mask # multiply by casaul mask if necessary if mask is not None: mask = mask * attn_mask else: mask = attn_mask return mask def _len_and_dim_norm(self, vectors): """ length and attention head size dim normalization """ vectors = self._len_norm(vectors) vectors = vectors * torch.rsqrt( torch.tensor(self.attention_head_size, device=vectors.device, dtype=vectors.dtype) ) return vectors def _len_norm(self, x, epsilon=1e-6): """ length normalization """ variance = torch.mean(x ** 2, -1, keepdim=True) norm_x = x * torch.rsqrt(variance + epsilon) return norm_x def _gather_by_expansion(self, vectors, idxs, num_hashes): """ expand dims of idxs and vectors for all hashes and gather """ expanded_idxs = idxs.unsqueeze(-1).expand(-1, -1, -1, self.attention_head_size) vectors = vectors.repeat(1, 1, num_hashes, 1) return torch.gather(vectors, 2, expanded_idxs) class ReverseSort(Function): """ After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here. """ @staticmethod def forward(ctx, out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx, num_hashes): # save sorted_bucket_idx for backprop with torch.no_grad(): ctx.sorted_bucket_idx = sorted_bucket_idx ctx.num_hashes = num_hashes # undo sort to have correct order for next layer expanded_undo_sort_indices = undo_sorted_bucket_idx.unsqueeze(-1).expand(out_vectors.shape) out_vectors = torch.gather(out_vectors, 2, expanded_undo_sort_indices) logits = torch.gather(logits, 2, undo_sorted_bucket_idx) return out_vectors, logits @staticmethod def backward(ctx, grad_out_vectors, grad_logits): # get parameters saved in ctx sorted_bucket_idx = ctx.sorted_bucket_idx num_hashes = ctx.num_hashes # get real gradient shape # shape is BatchSize x NumAttnHeads x ChunkLen * NumHashes grad_logits_shape = grad_logits.shape # shape is BatchSize x NumAttnHeads x ChunkLen * NumHashes x ChunkLen grad_out_vectors_shape = grad_out_vectors.shape # split gradient vectors and sorted bucket idxs by concatenated chunk dimension to gather correct indices # shape is BatchSize x NumAttnHeads x NumHashes x ChunkLen grad_logits = grad_logits.view((grad_logits_shape[:2] + (num_hashes, -1))) # shape is BatchSize x NumAttnHeads x NumHashes x ChunkLen x ChunkLen grad_out_vectors = grad_out_vectors.view( (grad_out_vectors_shape[:2] + (num_hashes, -1) + grad_out_vectors_shape[-1:]) ) # reshape and expand sorted_bucket_idx = torch.reshape(sorted_bucket_idx, (sorted_bucket_idx.shape[:2] + (num_hashes, -1))) expanded_sort_indices = sorted_bucket_idx.unsqueeze(-1).expand(grad_out_vectors.shape) # reverse sort of forward grad_out_vectors = torch.gather(grad_out_vectors, 3, expanded_sort_indices) grad_logits = torch.gather(grad_logits, 3, sorted_bucket_idx) # reshape into correct shape grad_logits = torch.reshape(grad_logits, grad_logits_shape) grad_out_vectors = torch.reshape(grad_out_vectors, grad_out_vectors_shape) # return grad and `None` fillers for last 3 forward args return grad_out_vectors, grad_logits, None, None, None class LocalSelfAttention(nn.Module, EfficientAttentionMixin): def __init__(self, config): super().__init__() self.num_attention_heads = config.num_attention_heads self.chunk_length = config.local_attn_chunk_length self.num_chunks_before = config.local_num_chunks_before self.num_chunks_after = config.local_num_chunks_after self.is_decoder = config.is_decoder self.pad_token_id = config.pad_token_id self.attention_head_size = config.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.hidden_size = config.hidden_size # projection matrices self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) self.dropout = config.local_attention_probs_dropout_prob # save mask value here self.register_buffer("mask_value_float16", torch.tensor(-1e4)) self.register_buffer("mask_value_float32", torch.tensor(-1e9)) def forward(self, hidden_states, attention_mask=None, head_mask=None, do_output_attentions=False, **kwargs): sequence_length = hidden_states.shape[1] batch_size = hidden_states.shape[0] # project hidden_states to query, key and value query_vectors = self.query(hidden_states) key_vectors = self.key(hidden_states) value_vectors = self.value(hidden_states) # split last dim into `config.num_attention_heads` and `config.attention_head_size` query_vectors = self._split_hidden_size_dim(query_vectors, self.num_attention_heads, self.attention_head_size) key_vectors = self._split_hidden_size_dim(key_vectors, self.num_attention_heads, self.attention_head_size) value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size) assert ( query_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( query_vectors.shape[-1], self.attention_head_size ) assert ( key_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( key_vectors.shape[-1], self.attention_head_size ) assert ( value_vectors.shape[-1] == self.attention_head_size ), "last dim of query_key_vectors is {} but should be {}.".format( value_vectors.shape[-1], self.attention_head_size ) if self.chunk_length is None: assert ( self.num_chunks_before == 0 and self.num_chunks_after == 0 ), "If `config.chunk_length` is `None`, make sure `config.num_chunks_after` and `config.num_chunks_before` are set to 0." # normalize key vectors key_vectors = key_vectors / torch.sqrt( torch.tensor(self.attention_head_size, device=key_vectors.device, dtype=key_vectors.dtype) ) # chunk vectors # B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len x attn_head_size query_vectors = self._split_seq_length_dim_to( query_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) key_vectors = self._split_seq_length_dim_to( key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) value_vectors = self._split_seq_length_dim_to( value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size, ) # chunk indices indices = torch.arange(sequence_length, device=query_vectors.device).repeat( batch_size, self.num_attention_heads, 1 ) query_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) key_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) # append chunks before and after key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) key_indices = self._look_adjacent(key_indices, self.num_chunks_before, self.num_chunks_after) query_key_dots = torch.matmul(query_vectors, key_vectors.transpose(-1, -2)) # free memory del query_vectors, key_vectors mask = self._compute_attn_mask(query_indices, key_indices, attention_mask, query_key_dots.shape) if mask is not None: # get mask tensor depending on half precision or not if query_key_dots.dtype == torch.float16: mask_value = self.mask_value_float16.half() else: mask_value = self.mask_value_float32 query_key_dots = torch.where(mask, query_key_dots, mask_value) # free memory del mask # softmax logits = torch.logsumexp(query_key_dots, dim=-1, keepdim=True) attention_probs = torch.exp(query_key_dots - logits) # free memory del logits # dropout attention_probs = nn.functional.dropout(attention_probs, p=self.dropout, training=self.training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask # attend values out_vectors = torch.matmul(attention_probs, value_vectors) # free memory del value_vectors # merge chunk length out_vectors = out_vectors.flatten(start_dim=2, end_dim=3) assert out_vectors.shape == (batch_size, self.num_attention_heads, sequence_length, self.attention_head_size,) out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) if do_output_attentions is False: attention_probs = () return LocalSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs) def _compute_attn_mask(self, query_indices, key_indices, attention_mask, query_key_dots_shape): mask = None # chunk attention mask and look before and after if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, :] attention_mask = self._split_seq_length_dim_to(attention_mask, -1, self.chunk_length, 1) attention_mask_key = self._look_adjacent(attention_mask, self.num_chunks_before, self.num_chunks_after) # Causal mask if self.is_decoder is True: mask = torch.ge(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).to(query_indices.device) # Attention mask if attention_mask is not None: # create attn_mask attn_mask = (attention_mask.unsqueeze(-1) * attention_mask_key.unsqueeze(-2)).expand(query_key_dots_shape) # multiply by casaul mask if necessary if mask is not None: mask = mask * attn_mask else: mask = attn_mask return mask class ReformerSelfOutput(nn.Module): def __init__(self, config): super().__init__() all_head_size = config.num_attention_heads * config.attention_head_size self.dropout = config.hidden_dropout_prob self.dense = nn.Linear(all_head_size, config.hidden_size, bias=False) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return hidden_states class ReformerAttention(nn.Module): def __init__(self, config, layer_id=0): super().__init__() self.layer_id = layer_id self.attn_layers = config.attn_layers self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "lsh": self.self_attention = LSHSelfAttention(config) elif len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "local": self.self_attention = LocalSelfAttention(config) elif len(set(self.attn_layers)) == 2 and set(self.attn_layers) == set(["lsh", "local"]): # get correct attn layers if self.attn_layers[self.layer_id] == "lsh": self.self_attention = LSHSelfAttention(config) else: self.self_attention = LocalSelfAttention(config) else: raise NotImplementedError( "Only attn layer types 'lsh' and 'local' exist, but got `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format( self.attn_layers ) ) self.output = ReformerSelfOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, buckets=None, ): hidden_states = self.layer_norm(hidden_states) # use cached buckets for backprob if buckets not None for LSHSelfAttention self_attention_outputs = self.self_attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, buckets=buckets, ) attention_output = self.output(self_attention_outputs.hidden_states) # add buckets if necessary if hasattr(self_attention_outputs, "buckets"): buckets = self_attention_outputs.buckets else: buckets = None return AttentionOutput( hidden_states=attention_output, attention_probs=self_attention_outputs.attention_probs, buckets=buckets, ) class ReformerFeedForwardDense(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob if isinstance(config.hidden_act, str): self.act_fn = ACT2FN[config.hidden_act] else: self.act_fn = config.hidden_act self.dense = nn.Linear(config.hidden_size, config.feed_forward_size) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = self.act_fn(hidden_states) return hidden_states class ReformerFeedForwardOutput(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.dense = nn.Linear(config.feed_forward_size, config.hidden_size) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return hidden_states class ChunkReformerFeedForward(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dense = ReformerFeedForwardDense(config) self.output = ReformerFeedForwardOutput(config) def forward(self, attention_output): return apply_chunking_to_forward( self.chunk_size_feed_forward, self.seq_len_dim, self.forward_chunk, attention_output, ) def forward_chunk(self, hidden_states): hidden_states = self.layer_norm(hidden_states) hidden_states = self.dense(hidden_states) return self.output(hidden_states) class ReformerLayer(nn.Module): def __init__(self, config, layer_id=0): super().__init__() self.attention = ReformerAttention(config, layer_id) # dropout requires to have the same # seed for forward and backward pass self.attention_seed = None self.feed_forward_seed = None self.feed_forward = ChunkReformerFeedForward(config) def _init_attention_seed(self): """ This function sets a new seed for the attention layer to make dropout deterministic for both forward calls: 1 normal forward call and 1 forward call in backward to recalculate activations. """ # randomize seeds if next(self.parameters()).device.type == "cuda": # GPU device_idx = torch.cuda.current_device() self.attention_seed = torch.cuda.default_generators[device_idx].seed() torch.cuda.manual_seed(self.attention_seed) else: # CPU self.attention_seed = int(torch.seed() % sys.maxsize) torch.manual_seed(self.attention_seed) def _init_feed_forward_seed(self): """ This function sets a new seed for the feed forward layer to make dropout deterministic for both forward calls: 1 normal forward call and 1 forward call in backward to recalculate activations. """ # randomize seeds if next(self.parameters()).device.type == "cuda": # GPU device_idx = torch.cuda.current_device() self.feed_forward_seed = torch.cuda.default_generators[device_idx].seed() torch.cuda.manual_seed(self.feed_forward_seed) else: # CPU self.feed_forward_seed = int(torch.seed() % sys.maxsize) torch.manual_seed(self.feed_forward_seed) def forward( self, prev_attn_output, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_attentions=False, ): with torch.no_grad(): # every forward pass we sample a different seed # for dropout and save for forward fn in backward pass # to have correct dropout self._init_attention_seed() attn_outputs = self.attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, ) attn_output = attn_outputs.hidden_states # Implementation of RevNet (see Fig. 6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) # Y_1 = X_1 + f(X_2) attn_output = prev_attn_output + attn_output # free memory del prev_attn_output # every forward pass we sample a different seed # for dropout and save seed for forward fn in backward # to have correct dropout self._init_feed_forward_seed() # Y_2 = X_2 + g(Y_1) hidden_states = hidden_states + self.feed_forward(attn_output) return ReformerOutput( attn_output=attn_output, hidden_states=hidden_states, attention_probs=attn_outputs.attention_probs, buckets=attn_outputs.buckets, ) def backward_pass( self, next_attn_output, hidden_states, grad_attn_output, grad_hidden_states, attention_mask=None, head_mask=None, buckets=None, ): # Implements the backward pass for reversible ResNets. # A good blog post on how this works can be found here: # Implementation of RevNet (see Fig. 6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) # This code is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py with torch.enable_grad(): next_attn_output.requires_grad = True # set seed to have correct dropout torch.manual_seed(self.feed_forward_seed) # g(Y_1) res_hidden_states = self.feed_forward(next_attn_output) res_hidden_states.backward(grad_hidden_states, retain_graph=True) with torch.no_grad(): # X_2 = Y_2 - g(Y_1) hidden_states = hidden_states - res_hidden_states del res_hidden_states grad_attn_output = grad_attn_output + next_attn_output.grad next_attn_output.grad = None with torch.enable_grad(): hidden_states.requires_grad = True # set seed to have correct dropout torch.manual_seed(self.attention_seed) # f(X_2) # use cached buckets for backprob if buckets not None for LSHSelfAttention output = self.attention( hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, buckets=buckets, ).hidden_states output.backward(grad_attn_output, retain_graph=True) with torch.no_grad(): # X_1 = Y_1 - f(X_2) attn_output = next_attn_output - output del output, next_attn_output grad_hidden_states = grad_hidden_states + hidden_states.grad hidden_states.grad = None hidden_states = hidden_states.detach() return ReformerBackwardOutput( attn_output=attn_output, hidden_states=hidden_states, grad_attn_output=grad_attn_output, grad_hidden_states=grad_hidden_states, ) class _ReversibleFunction(Function): """ To prevent PyTorch from performing the usual backpropagation, a customized backward function is implemented here. This way it is made sure that no memory expensive activations are saved during the forward pass. This function is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py """ @staticmethod def forward( ctx, hidden_states, layers, attention_mask, head_mask, num_hashes, all_hidden_states, all_attentions, do_output_hidden_states, do_output_attentions, ): all_buckets = () # split duplicated tensor hidden_states, attn_output = torch.chunk(hidden_states, 2, dim=-1) for layer, layer_head_mask in zip(layers, head_mask): if do_output_hidden_states is True: all_hidden_states.append(hidden_states) layer_outputs = layer( prev_attn_output=attn_output, hidden_states=hidden_states, attention_mask=attention_mask, head_mask=layer_head_mask, num_hashes=num_hashes, do_output_attentions=do_output_attentions, ) attn_output = layer_outputs.attn_output hidden_states = layer_outputs.hidden_states all_buckets = all_buckets + (layer_outputs.buckets,) if do_output_attentions: all_attentions.append(layer_outputs.attention_probs) # Add last layer if do_output_hidden_states is True: all_hidden_states.append(hidden_states) # attach params to ctx for backward ctx.save_for_backward(attn_output.detach(), hidden_states.detach()) ctx.layers = layers ctx.all_buckets = all_buckets ctx.head_mask = head_mask ctx.attention_mask = attention_mask # Concatenate 2 RevNet outputs return torch.cat([attn_output, hidden_states], dim=-1) @staticmethod def backward(ctx, grad_hidden_states): grad_attn_output, grad_hidden_states = torch.chunk(grad_hidden_states, 2, dim=-1) # retrieve params from ctx for backward attn_output, hidden_states = ctx.saved_tensors # create tuple output = ReformerBackwardOutput( attn_output=attn_output, hidden_states=hidden_states, grad_attn_output=grad_attn_output, grad_hidden_states=grad_hidden_states, ) # free memory del grad_attn_output, grad_hidden_states, attn_output, hidden_states layers = ctx.layers all_buckets = ctx.all_buckets head_mask = ctx.head_mask attention_mask = ctx.attention_mask for idx, layer in enumerate(layers[::-1]): # pop last buckets from stack buckets = all_buckets[-1] all_buckets = all_buckets[:-1] # backprop output = layer.backward_pass( next_attn_output=output.attn_output, hidden_states=output.hidden_states, grad_attn_output=output.grad_attn_output, grad_hidden_states=output.grad_hidden_states, head_mask=head_mask[len(layers) - idx - 1], attention_mask=attention_mask, buckets=buckets, ) assert all_buckets == (), "buckets have to be empty after backpropagation" grad_hidden_states = torch.cat([output.grad_attn_output, output.grad_hidden_states], dim=-1) # num of return vars has to match num of forward() args # return gradient for hidden_states arg and None for other args return grad_hidden_states, None, None, None, None, None, None, None, None class ReformerEncoder(nn.Module): def __init__(self, config): super().__init__() self.dropout = config.hidden_dropout_prob self.layers = nn.ModuleList([ReformerLayer(config, i) for i in range(config.num_hidden_layers)]) # Reformer is using Rev Nets, thus last layer outputs are concatenated and # Layer Norm is done over 2 * hidden_size self.layer_norm = nn.LayerNorm(2 * config.hidden_size, eps=config.layer_norm_eps) def forward( self, hidden_states, attention_mask=None, head_mask=None, num_hashes=None, do_output_hidden_states=False, do_output_attentions=False, ): # hidden_states and attention lists to be filled if wished all_hidden_states = [] all_attentions = [] # concat same tensor for reversible ResNet hidden_states = torch.cat([hidden_states, hidden_states], dim=-1) hidden_states = _ReversibleFunction.apply( hidden_states, self.layers, attention_mask, head_mask, num_hashes, all_hidden_states, all_attentions, do_output_hidden_states, do_output_attentions, ) # Apply layer norm to concatenated hidden states hidden_states = self.layer_norm(hidden_states) # Apply dropout hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return ReformerEncoderOutput( hidden_states=hidden_states, all_hidden_states=all_hidden_states, all_attentions=all_attentions ) class ReformerOnlyLMHead(nn.Module): def __init__(self, config): super().__init__() # Reformer is using Rev Nets, thus last layer outputs are concatenated and # Layer Norm is done over 2 * hidden_size self.seq_len_dim = 1 self.chunk_size_lm_head = config.chunk_size_lm_head self.decoder = nn.Linear(2 * config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, hidden_states): return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states) def forward_chunk(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states class ReformerPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = ReformerConfig base_model_prefix = "reformer" @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) dummy_inputs = { "input_ids": input_ids, "attention_mask": input_mask, } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, AxialPositionEmbeddings): for weight in module.weights: torch.nn.init.normal_(weight, std=self.config.axial_norm_std) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() REFORMER_START_DOCSTRING = r""" Reformer was proposed in `Reformer: The Efficient Transformer`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. .. _`Reformer: The Efficient Transformer`: https://arxiv.org/abs/2001.04451 This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.ReformerConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ REFORMER_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. During training the input_ids sequence_length has to be a multiple of the relevant model's chunk lengths (lsh's, local's or both). During evaluation, the indices are automatically padded to be a multiple of the chunk length. Indices can be obtained using :class:`transformers1.ReformerTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. num_hashes (:obj:`int`, `optional`, defaults to :obj:`None`): `num_hashes` is the number of hashing rounds that should be performed during bucketing. Setting `num_hashes` overwrites the default `num_hashes` defined in `config.num_hashes`. For more information, see `num_hashes` in :class:`transformers1.ReformerConfig`. """ @add_start_docstrings( "The bare Reformer Model transformer outputting raw hidden-states" "without any specific head on top.", REFORMER_START_DOCSTRING, ) class ReformerModel(ReformerPreTrainedModel): def __init__(self, config): super().__init__(config) self.config = config assert ( self.config.num_hidden_layers > 0 ), "`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local']" self.embeddings = ReformerEmbeddings(config) self.encoder = ReformerEncoder(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, num_hashes=None, do_output_hidden_states=False, do_output_attentions=False, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ReformerModel, ReformerTokenizer import torch tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment') model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # TODO(PVP): delete when PR to change output_attentions is made do_output_attentions = self.config.output_attentions do_output_hidden_states = self.config.output_hidden_states if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() # noqa: F841 device = input_ids.device elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] # noqa: F841 device = inputs_embeds.device else: raise ValueError("You have to specify either input_ids or inputs_embeds") assert ( len(input_shape) == 2 ), "`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {}".format(input_shape) # prepare head mask head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers, is_attention_chunked=True) # original sequence length for padding orig_sequence_length = input_shape[-1] # if needs padding least_common_mult_chunk_length = _get_least_common_mult_chunk_len(self.config) must_pad_to_match_chunk_length = input_shape[-1] % least_common_mult_chunk_length != 0 if must_pad_to_match_chunk_length: padding_length = least_common_mult_chunk_length - input_shape[-1] % least_common_mult_chunk_length if self.training is True: raise ValueError( "If training, sequence Length {} has to be a multiple of least common multiple chunk_length {}. Please consider padding the input to a length of {}.".format( input_shape[-1], least_common_mult_chunk_length, input_shape[-1] + padding_length ) ) # pad input input_ids, inputs_embeds, attention_mask, position_ids, input_shape = self._pad_to_mult_of_chunk_length( input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, input_shape=input_shape, padding_length=padding_length, padded_seq_length=least_common_mult_chunk_length, device=device, ) embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds) encoder_outputs = self.encoder( hidden_states=embedding_output, head_mask=head_mask, attention_mask=attention_mask, num_hashes=num_hashes, do_output_hidden_states=do_output_hidden_states, do_output_attentions=do_output_attentions, ) sequence_output = encoder_outputs.hidden_states # if padding was applied if must_pad_to_match_chunk_length: sequence_output = sequence_output[:, :orig_sequence_length] outputs = (sequence_output,) # TODO(PVP): Replace by named tuple after namedtuples are introduced in the library. if do_output_hidden_states is True: outputs = outputs + (encoder_outputs.all_hidden_states,) if do_output_attentions is True: outputs = outputs + (encoder_outputs.all_attentions,) return outputs def _pad_to_mult_of_chunk_length( self, input_ids, inputs_embeds=None, attention_mask=None, position_ids=None, input_shape=None, padding_length=None, padded_seq_length=None, device=None, ): logger.info( "Input ids are automatically padded from {} to {} to be a multiple of `config.chunk_length`: {}".format( input_shape[-1], input_shape[-1] + padding_length, padded_seq_length ) ) padded_input_ids = torch.full( (input_shape[0], padding_length), self.config.pad_token_id, device=device, dtype=torch.long, ) # Extend `attention_mask` if attention_mask is not None: attention_mask = torch.cat( [ attention_mask, torch.zeros(input_shape[0], padding_length, device=device, dtype=attention_mask.dtype,), ], dim=-1, ) else: attention_mask = torch.cat( [ torch.ones(input_shape, device=device, dtype=torch.uint8), torch.zeros((input_shape[0], padding_length), device=device, dtype=torch.uint8), ], dim=-1, ) # Extend `input_ids` with padding to match least common multiple chunk_length if input_ids is not None: input_ids = torch.cat([input_ids, padded_input_ids], dim=-1) input_shape = input_ids.size() # Pad position ids if given if position_ids is not None: padded_position_ids = torch.arange(input_shape[-1], padded_seq_length, dtype=torch.long, device=device) padded_position_ids = position_ids.unsqueeze(0).expand(input_shape[0], padding_length) position_ids = torch.cat([position_ids, padded_position_ids], dim=-1) # Extend `inputs_embeds` with padding to match least common multiple chunk_length if inputs_embeds is not None: padded_inputs_embeds = self.embeddings(padded_input_ids, position_ids) inputs_embeds = torch.cat([inputs_embeds, padded_inputs_embeds], dim=-2) input_shape = inputs_embeds.size() return input_ids, inputs_embeds, attention_mask, position_ids, input_shape @add_start_docstrings("""Reformer Model with a `language modeling` head on top. """, REFORMER_START_DOCSTRING) class ReformerModelWithLMHead(ReformerPreTrainedModel): def __init__(self, config): super().__init__(config) self.reformer = ReformerModel(config) self.lm_head = ReformerOnlyLMHead(config) self.init_weights() def get_output_embeddings(self): return self.lm_head.decoder def tie_weights(self): # word embeddings are not tied in Reformer pass @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING) def forward( self, input_ids=None, position_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, num_hashes=None, labels=None, do_output_hidden_states=False, do_output_attentions=False, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``do_output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import ReformerModelWithLMHead, ReformerTokenizer import torch tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment') model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ reformer_outputs = self.reformer( input_ids, position_ids=position_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, num_hashes=num_hashes, do_output_hidden_states=do_output_hidden_states, do_output_attentions=do_output_attentions, ) sequence_output = reformer_outputs[0] logits = self.lm_head(sequence_output) outputs = (logits,) + reformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # TODO(PVP): Add smart caching inputs_dict = {"input_ids": input_ids} if "num_hashes" in kwargs: inputs_dict["num_hashes"] = kwargs["num_hashes"] return inputs_dict ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch RoBERTa model. """ import logging import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu from .modeling_utils import create_position_ids_from_input_ids logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "roberta-base", "roberta-large", "roberta-large-mnli", "distilroberta-base", "roberta-base-openai-detector", "roberta-large-openai-detector", # See all RoBERTa models at https://huggingface.co/models?filter=roberta ] class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config): super().__init__(config) self.padding_idx = config.pad_token_id self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) self.position_embeddings = nn.Embedding( config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device) else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) return super().forward( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds ) def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. :param torch.Tensor inputs_embeds: :return torch.Tensor: """ input_shape = inputs_embeds.size()[:-1] sequence_length = input_shape[1] position_ids = torch.arange( self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device ) return position_ids.unsqueeze(0).expand(input_shape) ROBERTA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.RobertaTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, ) class RobertaModel(BertModel): """ This class overrides :class:`~transformers1.BertModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.embeddings = RobertaEmbeddings(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class RobertaForMaskedLM(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.roberta = RobertaModel(config) self.lm_head = RobertaLMHead(config) self.init_weights() def get_output_embeddings(self): return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForMaskedLM import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) class RobertaLMHead(nn.Module): """Roberta Head for masked language modeling.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias def forward(self, features, **kwargs): x = self.dense(features) x = gelu(x) x = self.layer_norm(x) # project back to size of vocabulary with bias x = self.decoder(x) return x @add_start_docstrings( """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForSequenceClassification(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForSequenceClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForMultipleChoice(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.roberta = RobertaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None, inputs_embeds=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForMultipleChoice import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMultipleChoice.from_pretrained('roberta-base') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None outputs = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, head_mask=head_mask, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) class RobertaForTokenClassification(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import RobertaTokenizer, RobertaForTokenClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForTokenClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x @add_start_docstrings( """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) class RobertaForQuestionAnswering(BertPreTrainedModel): config_class = RobertaConfig base_model_prefix = "roberta" def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-large is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers1 import RobertaTokenizer, RobertaForQuestionAnswering import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForQuestionAnswering.from_pretrained('roberta-base') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_ids = tokenizer.encode(question, text) start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_t5.py ================================================ # coding=utf-8 # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch T5 model. """ import copy import logging import math import os import torch import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer logger = logging.getLogger(__name__) #################################################### # This dict contrains shortcut names and associated url # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", # See all T5 models at https://huggingface.co/models?filter=t5 ] #################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 #################################################### def load_tf_weights_in_t5(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) tf_weights[name] = array for txt_name in names: name = txt_name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue if "_slot_" in name[-1]: logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue pointer = model array = tf_weights[txt_name] for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") # elif scope_names[0] == 'scale': # pointer = getattr(pointer, 'weight') # elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': # pointer = getattr(pointer, 'bias') # elif scope_names[0] == 'squad': # pointer = getattr(pointer, 'classifier') else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if scope_names[0] not in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") if scope_names[0] != "embedding": logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array.astype(np.float32)) tf_weights.pop(txt_name, None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model #################################################### # PyTorch Models are constructed by sub-classing # - torch.nn.Module for the layers and # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, x): # layer norm should always be calculated in float32 variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) x = x / torch.sqrt(variance + self.variance_epsilon) if self.weight.dtype == torch.float16: x = x.to(torch.float16) return self.weight * x class T5DenseReluDense(nn.Module): def __init__(self, config): super().__init__() self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): h = self.wi(hidden_states) h = F.relu(h) h = self.dropout(h) h = self.wo(h) return h class T5LayerFF(nn.Module): def __init__(self, config): super().__init__() self.DenseReluDense = T5DenseReluDense(config) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): norm_x = self.layer_norm(hidden_states) y = self.DenseReluDense(norm_x) layer_output = hidden_states + self.dropout(y) return layer_output class T5Attention(nn.Module): def __init__(self, config: T5Config, has_relative_attention_bias=False): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) if self.has_relative_attention_bias: self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_heads, self.d_kv) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q = prune_linear_layer(self.q, index) self.k = prune_linear_layer(self.k, index) self.v = prune_linear_layer(self.v, index) self.o = prune_linear_layer(self.o, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.inner_dim = self.d_kv * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on. Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ ret = 0 n = -relative_position if bidirectional: num_buckets //= 2 ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets n = torch.abs(n) else: n = torch.max(n, torch.zeros_like(n)) # now n is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 is_small = n < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance val_if_large = max_exact + ( torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.long) val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) ret += torch.where(is_small, n, val_if_large) return ret def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values def forward( self, input, mask=None, kv=None, position_bias=None, past_key_value_state=None, head_mask=None, query_length=None, use_cache=False, ): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) # past_key_value_state[0] is (bs, n_heads, q_len - 1, dim_per_head) bs, qlen, dim = input.size() if past_key_value_state is not None: assert self.is_decoder is True, "Encoder cannot cache past key value states" assert ( len(past_key_value_state) == 2 ), "past_key_value_state should have 2 past states: keys and values. Got {} past states".format( len(past_key_value_state) ) real_qlen = qlen + past_key_value_state[0].shape[2] if query_length is None else query_length else: real_qlen = qlen if kv is None: klen = real_qlen else: klen = kv.size(1) def shape(x): """ projection """ return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2) def unshape(x): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif past_key_value_state is None: k = v = kv k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if past_key_value_state is not None: if kv is None: k_, v_ = past_key_value_state k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = past_key_value_state if self.is_decoder and use_cache is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(real_qlen, klen) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] if mask is not None: position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) context = self.o(context) outputs = (context,) + present_key_value_state if self.output_attentions: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) return outputs class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention( norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class T5LayerCrossAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, query_length=None, ): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, query_length=query_length, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class T5Block(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)) self.layer.append(T5LayerFF(config)) def forward( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, ): if past_key_value_state is not None: assert self.is_decoder, "Only decoder can use `past_key_value_states`" expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4 error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format( expected_num_past_key_value_states, "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "", len(past_key_value_state), ) assert len(past_key_value_state) == expected_num_past_key_value_states, error_message self_attn_past_key_value_state = past_key_value_state[:2] cross_attn_past_key_value_state = past_key_value_state[2:] else: self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=self_attn_past_key_value_state, use_cache=use_cache, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights if self.is_decoder and encoder_hidden_states is not None: # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: query_length = present_key_value_state[0].shape[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, kv=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, head_mask=head_mask, past_key_value_state=cross_attn_past_key_value_state, query_length=query_length, use_cache=use_cache, ) hidden_states = cross_attention_outputs[0] # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) outputs = (hidden_states,) # Add attentions if we output them outputs = outputs + (present_key_value_state,) + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class T5PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = T5Config load_tf_weights = load_tf_weights_in_t5 base_model_prefix = "transformer" @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) dummy_inputs = { "decoder_input_ids": input_ids, "input_ids": input_ids, "decoder_attention_mask": input_mask, } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor * 1.0) elif isinstance(module, (T5Model, T5ForConditionalGeneration)): # Mesh TensorFlow embeddings initialization # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) elif isinstance(module, T5DenseReluDense): # Mesh TensorFlow FF initialization # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) if hasattr(module.wi, "bias") and module.wi.bias is not None: module.wi.bias.data.zero_() module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) if hasattr(module.wo, "bias") and module.wo.bias is not None: module.wo.bias.data.zero_() elif isinstance(module, T5Attention): # Mesh TensorFlow attention initialization to avoid scaling before softmax # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 d_model = self.config.d_model d_kv = self.config.d_kv n_heads = self.config.num_heads module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5)) module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5)) if module.has_relative_attention_bias: module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) def _shift_right(self, input_ids): decoder_start_token_id = self.config.decoder_start_token_id pad_token_id = self.config.pad_token_id assert ( decoder_start_token_id is not None ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information" # shift inputs to the right shifted_input_ids = input_ids.new_zeros(input_ids.shape) shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() shifted_input_ids[..., 0] = decoder_start_token_id assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." # replace possible -100 values in lm_labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100" return shifted_input_ids class T5Stack(T5PreTrainedModel): def __init__(self, config, embed_tokens=None): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder self.block = nn.ModuleList( [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] ) self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) self.init_weights() def get_input_embeddings(self): return self.embed_tokens def get_output_embeddings(self): return self.embed_tokens def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, past_key_value_states=None, use_cache=False, ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: if self.is_decoder: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape if past_key_value_states is not None: assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( input_shape, (batch_size, 1) ) # required mask seq length can be calculated via length of past # key value states and seq_length = 1 for the last token mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length else: mask_seq_length = seq_length if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = torch.ones( batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long ) # initialize past_key_value_states with `None` if past does not exist if past_key_value_states is None: past_key_value_states = [None] * len(self.block) # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device) if self.is_decoder and encoder_attention_mask is not None: encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) present_key_value_states = () all_hidden_states = () all_attentions = () position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i], past_key_value_state=past_key_value_state, use_cache=use_cache, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states, present_key_value_state = layer_outputs[:2] if i == 0: # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[3 if self.output_attentions else 2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3] # append next layer key value states present_key_value_states = present_key_value_states + (present_key_value_state,) if self.output_attentions: all_attentions = all_attentions + (layer_outputs[2],) # We keep only self-attention weights for now hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) outputs = outputs + (present_key_value_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (presents,) (all hidden states), (all attentions) T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: https://arxiv.org/abs/1910.10683 .. _`torch.nn.Module`: https://pytorch.org/docs/stable/nn.html#module Parameters: config (:class:`~transformers1.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ T5_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the inputs on both the right and the left. Indices can be obtained using :class:`transformers1.T5Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. To know more on how to prepare :obj:`input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`). To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up decoding. If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all `decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`). inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_inputs_embeds` have to be input (see `decoder_past_key_value_states`). This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ @add_start_docstrings( "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", T5_START_DOCSTRING, ) class T5Model(T5PreTrainedModel): def __init__(self, config): super().__init__(config) self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) self.encoder = T5Stack(encoder_config, self.shared) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = T5Stack(decoder_config, self.shared) self.init_weights() def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_past_key_value_states=None, use_cache=True, inputs_embeds=None, decoder_inputs_embeds=None, head_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import T5Tokenizer, T5Model tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5Model.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] return decoder_outputs + encoder_outputs @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): def __init__(self, config): super().__init__(config) self.model_dim = config.d_model self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) self.encoder = T5Stack(encoder_config, self.shared) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = T5Stack(decoder_config, self.shared) self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) self.init_weights() def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) def get_output_embeddings(self): return self.lm_head def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, encoder_outputs=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_past_key_value_states=None, use_cache=True, lm_labels=None, inputs_embeds=None, decoder_inputs_embeds=None, head_mask=None, ): r""" lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). If `past_key_value_states` is used only the last prediction_scores of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. Examples:: from transformers1 import T5Tokenizer, T5ForConditionalGeneration tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids) loss, prediction_scores = outputs[:2] tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1 outputs = model.generate(input_ids) """ # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask ) hidden_states = encoder_outputs[0] if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(lm_labels) # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: assert lm_labels is None, "Decoder should not use cached key value states when training." if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) # insert decoder past at right place # to speed up decoding if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 sequence_output = sequence_output * (self.model_dim ** -0.5) lm_logits = self.lm_head(sequence_output) decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 decoder_outputs = (loss,) + decoder_outputs return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if len(past) < 2: encoder_outputs, decoder_past_key_value_states = past, None else: encoder_outputs, decoder_past_key_value_states = past[0], past[1] return { "decoder_input_ids": input_ids, "decoder_past_key_value_states": decoder_past_key_value_states, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "use_cache": use_cache, } def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if len(past) < 2: logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") return past decoder_past = past[1] past = (past[0],) reordered_decoder_past = () for layer_past_states in decoder_past: # get the correct batch idx from layer past batch dim # batch dim of `past` is at 2nd position reordered_layer_past_states = () for layer_past_state in layer_past_states: # need to set correct `past` for each of the four key / value states reordered_layer_past_states = reordered_layer_past_states + ( layer_past_state.index_select(0, beam_idx), ) assert reordered_layer_past_states[0].shape == layer_past_states[0].shape assert len(reordered_layer_past_states) == len(layer_past_states) reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return past + (reordered_decoder_past,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_albert.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 ALBERT model. """ import logging import tensorflow as tf from .configuration_albert import AlbertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "albert-base-v1", "albert-large-v1", "albert-xlarge-v1", "albert-xxlarge-v1", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", # See all ALBERT models at https://huggingface.co/models?filter=albert ] class TFAlbertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.config = config self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.embedding_size, embeddings_initializer=get_initializer(self.config.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.embedding_size, embeddings_initializer=get_initializer(self.config.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.config.vocab_size, self.config.embedding_size], initializer=get_initializer(self.config.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, embedding_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.config.embedding_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.config.vocab_size]) class TFAlbertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) self.key = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) self.value = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFAlbertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFAlbertAttention(TFBertSelfAttention): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): input_tensor, attention_mask, head_mask = inputs batch_size = shape_list(input_tensor)[0] mixed_query_layer = self.query(input_tensor) mixed_key_layer = self.key(input_tensor) mixed_value_layer = self.value(input_tensor) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) hidden_states = self_outputs[0] hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) attention_output = self.LayerNorm(hidden_states + input_tensor) # add attentions if we output them outputs = (attention_output,) + self_outputs[1:] return outputs class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attention = TFAlbertAttention(config, name="attention") self.ffn = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" ) if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.ffn_output = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" ) self.full_layer_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="full_layer_layer_norm" ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) ffn_output = self.ffn(attention_outputs[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0]) # add attentions if we output them outputs = (hidden_states,) + attention_outputs[1:] return outputs class TFAlbertLayerGroup(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = [ TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num) ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs layer_hidden_states = () layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training) hidden_states = layer_output[0] if self.output_attentions: layer_attentions = layer_attentions + (layer_output[1],) if self.output_hidden_states: layer_hidden_states = layer_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (layer_hidden_states,) if self.output_attentions: outputs = outputs + (layer_attentions,) # last-layer hidden state, (layer hidden states), (layer attentions) return outputs class TFAlbertTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embedding_hidden_mapping_in = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="embedding_hidden_mapping_in", ) self.albert_layer_groups = [ TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups) ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_attentions = () if self.output_hidden_states: all_hidden_states = (hidden_states,) for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( [ hidden_states, attention_mask, head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], ], training=training, ) hidden_states = layer_group_output[0] if self.output_attentions: all_attentions = all_attentions + layer_group_output[-1] if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) # last-layer hidden state, (all hidden states), (all attentions) return outputs class TFAlbertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = AlbertConfig base_model_prefix = "albert" class TFAlbertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.decoder_bias = self.add_weight( shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) super().build(input_shape) def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.LayerNorm(hidden_states) hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias return hidden_states @keras_serializable class TFAlbertMainLayer(tf.keras.layers.Layer): config_class = AlbertConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") self.pooler = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="pooler", ) def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) # add hidden_states and attentions if they are here outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # sequence_output, pooled_output, (hidden_states), (attentions) return outputs ALBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`: https://arxiv.org/abs/1909.11942 .. _`tf.keras.Model`: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Args: config (:class:`~transformers1.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ALBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.AlbertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", ALBERT_START_DOCSTRING, ) class TFAlbertModel(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Albert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertModel tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertModel.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.albert(inputs, **kwargs) return outputs @add_start_docstrings( """Albert Model with two heads on top for pre-training: a `masked language modeling` head and a `sentence order prediction` (classification) head. """, ALBERT_START_DOCSTRING, ) class TFAlbertForPreTraining(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") def get_output_embeddings(self): return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForPreTraining tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForPreTraining.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, sop_scores = outputs[:2] """ outputs = self.albert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False)) outputs = (prediction_scores, sop_scores) + outputs[2:] return outputs class TFAlbertSOPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) def call(self, pooled_output, training: bool): dropout_pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(dropout_pooled_output) return logits @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") def get_output_embeddings(self): return self.albert.embeddings @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForMaskedLM tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] return outputs # prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ALBERT_START_DOCSTRING, ) class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForSequenceClassification tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.albert(inputs, **kwargs) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ALBERT_START_DOCSTRING, ) class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertMainLayer(config, name="albert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.AlbertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForQuestionAnswering tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2') input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ALBERT_START_DOCSTRING, ) class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertMainLayer(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import AlbertTokenizer, TFAlbertForMultipleChoice tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2') example1 = ["This is a context", "Is it a context? Yes"] example2 = ["This is a context", "Is it a context? No"] encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128) outputs = model(encoding["input_ids"][None, :]) logits = outputs[0] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): print("isdict(1)") input_ids = inputs.get("input_ids") print(input_ids) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None: num_choices = shape_list(input_ids)[1] seq_length = shape_list(input_ids)[2] else: num_choices = shape_list(inputs_embeds)[1] seq_length = shape_list(inputs_embeds)[2] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] outputs = self.albert(flat_inputs, training=training) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # reshaped_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Model class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BertConfig, CTRLConfig, DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLNetConfig, ) from .configuration_utils import PretrainedConfig from .modeling_tf_albert import ( TFAlbertForMaskedLM, TFAlbertForMultipleChoice, TFAlbertForPreTraining, TFAlbertForQuestionAnswering, TFAlbertForSequenceClassification, TFAlbertModel, ) from .modeling_tf_bert import ( TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, TFBertForTokenClassification, TFBertModel, ) from .modeling_tf_ctrl import TFCTRLLMHeadModel, TFCTRLModel from .modeling_tf_distilbert import ( TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertModel, ) from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForQuestionAnswering, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) from .modeling_tf_t5 import TFT5ForConditionalGeneration, TFT5Model from .modeling_tf_transfo_xl import TFTransfoXLLMHeadModel, TFTransfoXLModel from .modeling_tf_xlm import ( TFXLMForQuestionAnsweringSimple, TFXLMForSequenceClassification, TFXLMModel, TFXLMWithLMHeadModel, ) from .modeling_tf_xlnet import ( TFXLNetForQuestionAnsweringSimple, TFXLNetForSequenceClassification, TFXLNetForTokenClassification, TFXLNetLMHeadModel, TFXLNetModel, ) logger = logging.getLogger(__name__) TF_MODEL_MAPPING = OrderedDict( [ (T5Config, TFT5Model), (DistilBertConfig, TFDistilBertModel), (AlbertConfig, TFAlbertModel), (RobertaConfig, TFRobertaModel), (BertConfig, TFBertModel), (OpenAIGPTConfig, TFOpenAIGPTModel), (GPT2Config, TFGPT2Model), (TransfoXLConfig, TFTransfoXLModel), (XLNetConfig, TFXLNetModel), (XLMConfig, TFXLMModel), (CTRLConfig, TFCTRLModel), ] ) TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForPreTraining), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForPreTraining), (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (GPT2Config, TFGPT2LMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel), (XLNetConfig, TFXLNetLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel), (CTRLConfig, TFCTRLLMHeadModel), ] ) TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForMaskedLM), (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (GPT2Config, TFGPT2LMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel), (XLNetConfig, TFXLNetLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel), (CTRLConfig, TFCTRLLMHeadModel), ] ) TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (RobertaConfig, TFRobertaForSequenceClassification), (BertConfig, TFBertForSequenceClassification), (XLNetConfig, TFXLNetForSequenceClassification), (XLMConfig, TFXLMForSequenceClassification), ] ) TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [(BertConfig, TFBertForMultipleChoice), (AlbertConfig, TFAlbertForMultipleChoice)] ) TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForQuestionAnswering), (AlbertConfig, TFAlbertForQuestionAnswering), (RobertaConfig, TFRobertaForQuestionAnswering), (BertConfig, TFBertForQuestionAnswering), (XLNetConfig, TFXLNetForQuestionAnsweringSimple), (XLMConfig, TFXLMForQuestionAnsweringSimple), ] ) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ (DistilBertConfig, TFDistilBertForTokenClassification), (RobertaConfig, TFRobertaForTokenClassification), (BertConfig, TFBertForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification), ] ) class TFAutoModel(object): r""" :class:`~transformers1.TFAutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) - `bert`: TFBertModel (Bert model) - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) - `gpt2`: TFGPT2Model (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLModel (Transformer-XL model) - `xlnet`: TFXLNetModel (XLNet model) - `xlm`: TFXLMModel (XLM model) - `ctrl`: TFCTRLModel (CTRL model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModel is designed to be instantiated " "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModel.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model) - isInstance of `bert` configuration class: TFBertModel (Bert model) - isInstance of `openai-gpt` configuration class: TFOpenAIGPTModel (OpenAI GPT model) - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TFTransfoXLModel (Transformer-XL model) - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model) - isInstance of `xlm` configuration class: TFXLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) - `bert`: TFTFBertModel (Bert model) - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) - `gpt2`: TFGPT2Model (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLModel (Transformer-XL model) - `xlnet`: TFXLNetModel (XLNet model) - `ctrl`: TFCTRLModel (CTRL model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys()) ) ) class TFAutoModelForPreTraining(object): r""" :class:`~transformers1.TFAutoModelForPreTraining` is a generic model class that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` class method. This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForPreTraining is designed to be instantiated " "using the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForPreTraining.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config (:class:`~transformers.PretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers1.TFDistilBertModelForMaskedLM` (DistilBERT model) - isInstance of `roberta` configuration class: :class:`~transformers1.TFRobertaModelForMaskedLM` (RoBERTa model) - isInstance of `bert` configuration class: :class:`~transformers1.TFBertForPreTraining` (Bert model) - isInstance of `openai-gpt` configuration class: :class:`~transformers1.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) - isInstance of `gpt2` configuration class: :class:`~transformers1.TFGPT2ModelLMHeadModel` (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: :class:`~transformers1.TFCTRLModelLMHeadModel` (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: :class:`~transformers1.TFTransfoXLLMHeadModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers1.TFXLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers1.TFXLMWithLMHeadModel` (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: :class:`~transformers1.TFT5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers1.TFDistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers1.TFAlbertForPreTraining` (ALBERT model) - `roberta`: :class:`~transformers1.TFRobertaForMaskedLM` (RoBERTa model) - `bert`: :class:`~transformers1.TFBertForPreTraining` (Bert model) - `openai-gpt`: :class:`~transformers1.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) - `gpt2`: :class:`~transformers1.TFGPT2LMHeadModel` (OpenAI GPT-2 model) - `transfo-xl`: :class:`~transformers1.TFTransfoXLLMHeadModel` (Transformer-XL model) - `xlnet`: :class:`~transformers1.TFXLNetLMHeadModel` (XLNet model) - `xlm`: :class:`~transformers1.TFXLMWithLMHeadModel` (XLM model) - `ctrl`: :class:`~transformers1.TFCTRLLMHeadModel` (Salesforce CTRL model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Args: pretrained_model_name_or_path: Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely received file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForPreTraining.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) ) class TFAutoModelWithLMHead(object): r""" :class:`~transformers1.TFAutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) - `bert`: TFBertForMaskedLM (Bert model) - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) - `xlnet`: TFXLNetLMHeadModel (XLNet model) - `xlm`: TFXLMWithLMHeadModel (XLM model) - `ctrl`: TFCTRLLMHeadModel (CTRL model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelWithLMHead is designed to be instantiated " "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelWithLMHead.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelWithLMHead.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the language modeling model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) - `bert`: TFBertForMaskedLM (Bert model) - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) - `xlnet`: TFXLNetLMHeadModel (XLNet model) - `xlm`: TFXLMWithLMHeadModel (XLM model) - `ctrl`: TFCTRLLMHeadModel (CTRL model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys()) ) ) class TFAutoModelForMultipleChoice: r""" :class:`~transformers1.TFAutoModelForMultipleChoice` is a generic model class that will be instantiated as one of the multiple choice model classes of the library when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `albert`: TFAlbertForMultipleChoice (Albert model) - `bert`: TFBertForMultipleChoice (Bert model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForMultipleChoice is designed to be instantiated " "using the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForMultipleChoice.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `albert` configuration class: AlbertModel (Albert model) - isInstance of `bert` configuration class: BertModel (Bert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the multiple choice model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `albert`: TFRobertaForMultiple (Albert model) - `bert`: TFBertForMultipleChoice (Bert model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelFormultipleChoice.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelFormultipleChoice.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelFormultipleChoice.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()), ) ) class TFAutoModelForSequenceClassification(object): r""" :class:`~transformers1.TFAutoModelForSequenceClassification` is a generic model class that will be instantiated as one of the sequence classification model classes of the library when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) - `xlnet`: TFXLNetForSequenceClassification (XLNet model) - `xlm`: TFXLMForSequenceClassification (XLM model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForSequenceClassification is designed to be instantiated " "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForSequenceClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the sequence classification model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) - `xlnet`: TFXLNetForSequenceClassification (XLNet model) - `xlm`: TFXLMForSequenceClassification (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()), ) ) class TFAutoModelForQuestionAnswering(object): r""" :class:`~transformers1.TFAutoModelForQuestionAnswering` is a generic model class that will be instantiated as one of the question answering model classes of the library when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) - `bert`: TFBertForQuestionAnswering (Bert model) - `xlnet`: TFXLNetForQuestionAnswering (XLNet model) - `xlm`: TFXLMForQuestionAnswering (XLM model) This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError( "TFAutoModelForQuestionAnswering is designed to be instantiated " "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " "`TFAutoModelForQuestionAnswering.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `albert` configuration class: AlbertModel (ALBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `xlm` configuration class: XLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForQuestionAnswering.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) - `bert`: TFBertForQuestionAnswering (Bert model) - `xlnet`: TFXLNetForQuestionAnswering (XLNet model) - `xlm`: TFXLMForQuestionAnswering (XLM model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean Set to True if the Checkpoint is a PyTorch checkpoint. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()), ) ) class TFAutoModelForTokenClassification: def __init__(self): raise EnvironmentError( "TFAutoModelForTokenClassification is designed to be instantiated " "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " "`AutoModelForTokenClassification.from_config(config)` methods." ) @classmethod def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. Note: Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :func:`~transformers1.AutoModel.from_pretrained` to load the model weights Args: config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model) - isInstance of `roberta` configuration class: RobteraModel (Roberta model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class(config) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library from a pre-trained model configuration. The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `bert`: BertForTokenClassification (Bert model) - `xlnet`: XLNetForTokenClassification (XLNet model) - `distilbert`: DistilBertForTokenClassification (DistilBert model) - `roberta`: RobertaForTokenClassification (Roberta model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~transformers1.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = TFAutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()), ) ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 BERT model. """ import logging import numpy as np import tensorflow as tf from .configuration_bert import BertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased", "bert-base-chinese", "bert-base-german-cased", "bert-large-uncased-whole-word-masking", "bert-large-cased-whole-word-masking", "bert-large-uncased-whole-word-masking-finetuned-squad", "bert-large-cased-whole-word-masking-finetuned-squad", "bert-base-cased-finetuned-mrpc", "cl-tohoku/bert-base-japanese", "cl-tohoku/bert-base-japanese-whole-word-masking", "cl-tohoku/bert-base-japanese-char", "cl-tohoku/bert-base-japanese-char-whole-word-masking", "TurkuNLP/bert-base-finnish-cased-v1", "TurkuNLP/bert-base-finnish-uncased-v1", "wietsedv/bert-base-dutch-cased", # See all BERT models at https://huggingface.co/models?filter=bert ] def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.sigmoid(x) ACT2FN = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), "gelu_new": tf.keras.layers.Activation(gelu_new), } class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size self.initializer_range = config.initializer_range self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.hidden_size, embeddings_initializer=get_initializer(self.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.hidden_size, embeddings_initializer=get_initializer(self.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.hidden_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFBertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" ) self.key = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" ) self.value = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = tf.matmul( query_layer, key_layer, transpose_b=True ) # (batch size, num_heads, seq_len_q, seq_len_k) dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs, training=training) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape( context_layer, (batch_size, -1, self.all_head_size) ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.self_attention = TFBertSelfAttention(config, name="self") self.dense_output = TFBertSelfOutput(config, name="output") def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): input_tensor, attention_mask, head_mask = inputs self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training) attention_output = self.dense_output([self_outputs[0], input_tensor], training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, input_tensor = inputs hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.attention = TFBertAttention(config, name="attention") self.intermediate = TFBertIntermediate(config, name="intermediate") self.bert_output = TFBertOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) attention_output = attention_outputs[0] intermediate_output = self.intermediate(attention_output) layer_output = self.bert_output([intermediate_output, attention_output], training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs class TFBertEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # outputs, (hidden states), (attentions) class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) return pooled_output class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.transform = TFBertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states class TFBertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") def call(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores class TFBertNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" ) def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score @keras_serializable class TFBertMainLayer(tf.keras.layers.Layer): config_class = BertConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFBertEmbeddings(config, name="embeddings") self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) class TFBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = BertConfig base_model_prefix = "bert" BERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`__ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class TFBertModel(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertModel tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertModel.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.bert(inputs, **kwargs) return outputs @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class TFBertForPreTraining(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForPreTraining tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForPreTraining.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) seq_relationship_score = self.nsp(pooled_output) outputs = (prediction_scores, seq_relationship_score,) + outputs[ 2: ] # add hidden states and attention if they are here return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class TFBertForMaskedLM(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForMaskedLM tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class TFBertForNextSentencePrediction(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForNextSentencePrediction tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf') logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] assert logits[0][0] < logits[0][1] # the next sentence was random """ outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] seq_relationship_score = self.nsp(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here return outputs # seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class TFBertForSequenceClassification(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForSequenceClassification tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class TFBertForMultipleChoice(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForMultipleChoice tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." choice0 = "It is eaten with a fork and a knife." choice1 = "It is eaten while held in the hand." encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='tf', pad_to_max_length=True) # linear classifier on the output is not yet trained outputs = model(encoding['input_ids'][None, :]) logits = outputs[0] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None: num_choices = shape_list(input_ids)[1] seq_length = shape_list(input_ids)[2] else: num_choices = shape_list(inputs_embeds)[1] seq_length = shape_list(inputs_embeds)[2] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] outputs = self.bert(flat_inputs, training=training) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) reshaped_logits = tf.reshape(logits, (-1, num_choices)) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class TFBertForTokenClassification(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForTokenClassification tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class TFBertForQuestionAnswering(TFBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.BertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import BertTokenizer, TFBertForQuestionAnswering tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :]) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1]) assert answer == "a nice puppet" """ outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_camembert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 CamemBERT model. """ import logging from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) logger = logging.getLogger(__name__) TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all CamemBERT models at https://huggingface.co/models?filter=camembert ] CAMEMBERT_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", CAMEMBERT_START_DOCSTRING, ) class TFCamembertModel(TFRobertaModel): """ This class overrides :class:`~transformers1.TFRobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForMaskedLM(TFRobertaForMaskedLM): """ This class overrides :class:`~transformers1.TFRobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): """ This class overrides :class:`~transformers1.TFRobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig @add_start_docstrings( """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, CAMEMBERT_START_DOCSTRING, ) class TFCamembertForTokenClassification(TFRobertaForTokenClassification): """ This class overrides :class:`~transformers1.TFRobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = CamembertConfig ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 CTRL model.""" import logging import numpy as np import tensorflow as tf from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ctrl" # See all CTRL models at https://huggingface.co/models?filter=ctrl ] def angle_defn(pos, i, d_model_size): angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size)) return pos * angle_rates def positional_encoding(position, d_model_size): # create the sinusoidal pattern for the positional encoding angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) sines = np.sin(angle_rads[:, 0::2]) cosines = np.cos(angle_rads[:, 1::2]) # pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...], dtype=tf.float32) pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32) return pos_encoding def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = tf.matmul(q, k, transpose_b=True) dk = tf.cast(shape_list(k)[-1], tf.float32) scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) if mask is not None: scaled_attention_logits += mask * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask output = tf.matmul(attention_weights, v) return output, attention_weights class TFMultiHeadAttention(tf.keras.layers.Layer): def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): super().__init__(**kwargs) self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size self.depth = int(d_model_size / self.num_heads) self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") self.dense = tf.keras.layers.Dense(d_model_size, name="dense") def split_into_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): v, k, q, mask, layer_past, attention_mask, head_mask, use_cache = inputs batch_size = shape_list(q)[0] q = self.Wq(q) k = self.Wk(k) v = self.Wv(v) q = self.split_into_heads(q, batch_size) k = self.split_into_heads(k, batch_size) v = self.split_into_heads(v, batch_size) if layer_past is not None: past_key, past_value = tf.unstack(layer_past, axis=0) k = tf.concat((past_key, k), axis=-2) v = tf.concat((past_value, v), axis=-2) # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if use_cache is True: present = tf.stack((k, v), axis=0) else: present = (None,) output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) attn = output[1] original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) output = self.dense(original_size_attention) outputs = (output, present) if self.output_attentions: outputs = outputs + (attn,) return outputs def point_wise_feed_forward_network(d_model_size, dff, name=""): return tf.keras.Sequential( [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")], name="ffn", ) class TFEncoderLayer(tf.keras.layers.Layer): def __init__( self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs ): super().__init__(**kwargs) self.multi_head_attention = TFMultiHeadAttention( d_model_size, num_heads, output_attentions, name="multi_head_attention" ) self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn") self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") self.dropout1 = tf.keras.layers.Dropout(rate) self.dropout2 = tf.keras.layers.Dropout(rate) def call(self, inputs, training=False): x, mask, layer_past, attention_mask, head_mask, use_cache = inputs normed = self.layernorm1(x) attn_outputs = self.multi_head_attention( [normed, normed, normed, mask, layer_past, attention_mask, head_mask, use_cache], training=training ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output, training=training) out1 = x + attn_output out2 = self.layernorm2(out1) ffn_output = self.ffn(out2) ffn_output = self.dropout2(ffn_output, training=training) out2 = out1 + ffn_output outputs = (out2,) + attn_outputs[1:] return outputs @keras_serializable class TFCTRLMainLayer(tf.keras.layers.Layer): config_class = CTRLConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.d_model_size = config.n_embd self.num_layers = config.n_layer self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) self.w = TFSharedEmbeddings( config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w" ) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [ TFEncoderLayer( config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.layer_norm_epsilon, config.output_attentions, name="h_._{}".format(i), ) for i in range(config.n_layer) ] self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") def get_input_embeddings(self): return self.w def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs # If using past key value states, only the last tokens # should be given as an input if past is not None: if input_ids is not None: input_ids = input_ids[:, -1:] if inputs_embeds is not None: inputs_embeds = inputs_embeds[:, -1:] if token_type_ids is not None: token_type_ids = token_type_ids[:, -1:] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = shape_list(past[0][0])[-2] if position_ids is None: position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.tile(position_ids, [input_shape[0], 1]) # Attention mask. if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_layers if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.w(token_type_ids, mode="embedding") token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) else: token_type_embeds = 0 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.w(input_ids, mode="embedding") seq_len = input_shape[-1] mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) pos_embeds = tf.gather(self.pos_encoding, position_ids) hidden_states = inputs_embeds + pos_embeds + token_type_embeds hidden_states = self.dropout(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] presents = () all_hidden_states = () all_attentions = [] for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = h([hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache], training=training) hidden_states, present = outputs[:2] if use_cache is True: presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.layernorm(hidden_states) hidden_states = tf.reshape(hidden_states, output_shape) if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs class TFCTRLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = CTRLConfig base_model_prefix = "transformer" CTRL_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.CTRLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ CTRL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only input_ids that do not have their past calculated should be passed as input_ids (see `past`). Indices can be obtained using :class:`transformers1.CTRLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `past` key value states are returned and can be used to speed up decoding (see `past`). Defaults to `True`. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", CTRL_START_DOCSTRING, ) class TFCTRLModel(TFCTRLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import CTRLTokenizer, TFCTRLModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLModel.from_pretrained('ctrl') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs class TFCTRLLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") def get_output_embeddings(self): return self.lm_head.input_embeddings def prepare_inputs_for_generation(self, inputs, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: inputs = tf.expand_dims(inputs[:, -1], -1) return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.CTRLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import CTRLTokenizer, TFCTRLLMHeadModel tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLLMHeadModel.from_pretrained('ctrl') input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)]) outputs = model(input_ids) loss, logits = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 DistilBERT model """ import logging import math import numpy as np import tensorflow as tf from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf class TFEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range self.word_embeddings = TFSharedEmbeddings( config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings" ) # padding_idx=0) self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.dim, embeddings_initializer=get_initializer(config.initializer_range), name="position_embeddings", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, inputs_embeds=None, training=False): """ Parameters ---------- input_ids: tf.Tensor(bs, max_seq_length) The token ids to embed. Outputs ------- embeddings: tf.Tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ if not isinstance(inputs, (tuple, list)): input_ids = inputs position_ids = None else: input_ids, position_ids = inputs if input_ids is not None: seq_length = shape_list(input_ids)[1] else: seq_length = shape_list(inputs_embeds)[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.dim]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFMultiHeadSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.output_attentions = config.output_attentions assert self.dim % self.n_heads == 0 self.q_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" ) self.k_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" ) self.v_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" ) self.out_lin = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): """ Parameters ---------- query: tf.Tensor(bs, seq_length, dim) key: tf.Tensor(bs, seq_length, dim) value: tf.Tensor(bs, seq_length, dim) mask: tf.Tensor(bs, seq_length) Outputs ------- weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` """ query, key, value, mask, head_mask = inputs bs, q_length, dim = shape_list(query) k_length = shape_list(key)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshape = [bs, 1, 1, k_length] def shape(x): """ separate heads """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) def unshape(x): """ group heads """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) scores = scores - 1e30 * (1.0 - mask) weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) self.lin1 = tf.keras.layers.Dense( config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" ) self.lin2 = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" ) assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( config.activation ) self.activation = ( tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu ) def call(self, input, training=False): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x, training=training) return x class TFTransformerBlock(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim self.hidden_dim = config.hidden_dim self.dropout = tf.keras.layers.Dropout(config.dropout) self.activation = config.activation self.output_attentions = config.output_attentions assert config.dim % config.n_heads == 0 self.attention = TFMultiHeadSelfAttention(config, name="attention") self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") self.ffn = TFFFN(config, name="ffn") self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") def call(self, inputs, training=False): # removed: src_enc=None, src_len=None """ Parameters ---------- x: tf.Tensor(bs, seq_length, dim) attn_mask: tf.Tensor(bs, seq_length) Outputs ------- sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: tf.Tensor(bs, seq_length, dim) The output of the transformer block contextualization. """ x, attn_mask, head_mask = inputs # Self-Attention sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training) if self.output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples # assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if self.output_attentions: output = (sa_weights,) + output return output class TFTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] def call(self, inputs, training=False): """ Parameters ---------- x: tf.Tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: tf.Tensor(bs, seq_length) Attention mask on the sequence. Outputs ------- hidden_state: tf.Tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top) layer all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ x, attn_mask, head_mask = inputs all_hidden_states = () all_attentions = () hidden_state = x for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training) hidden_state = layer_outputs[-1] if self.output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) outputs = (hidden_state,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class TFDistilBertMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): raise NotImplementedError def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.ones(input_shape) # (bs, seq_length) attention_mask = tf.cast(attention_mask, dtype=tf.float32) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = DistilBertConfig base_model_prefix = "distilbert" DISTILBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class TFDistilBertModel(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertModel tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertModel.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.distilbert(inputs, **kwargs) return outputs class TFDistilBertLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.vocab_size = config.vocab_size self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.vocab_transform = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" ) self.act = tf.keras.layers.Activation(gelu) self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") def get_output_embeddings(self): return self.vocab_projector.input_embeddings @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForMaskedLM tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_states = distilbert_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) outputs = (prediction_logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.pre_classifier = tf.keras.layers.Dense( config.dim, kernel_initializer=get_initializer(config.initializer_range), activation="relu", name="pre_classifier", ) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForSequenceClassification tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.dropout = tf.keras.layers.Dropout(config.dropout) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForTokenClassification tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.distilbert(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, DISTILBERT_START_DOCSTRING, ) class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) assert config.num_labels == 2 self.dropout = tf.keras.layers.Dropout(config.qa_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1,DistilBertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import DistilBertTokenizer, TFDistilBertForQuestionAnswering tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ distilbert_output = self.distilbert(inputs, **kwargs) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + distilbert_output[1:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_electra.py ================================================ import logging import tensorflow as tf from transformers import ElectraConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_utils import get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/electra-small-generator", "google/electra-base-generator", "google/electra-large-generator", "google/electra-small-discriminator", "google/electra-base-discriminator", "google/electra-large-discriminator", # See all ELECTRA models at https://huggingface.co/models?filter=electra ] class TFElectraEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.embedding_size = config.embedding_size self.initializer_range = config.initializer_range self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, config.embedding_size, embeddings_initializer=get_initializer(self.initializer_range), name="position_embeddings", ) self.token_type_embeddings = tf.keras.layers.Embedding( config.type_vocab_size, config.embedding_size, embeddings_initializer=get_initializer(self.initializer_range), name="token_type_embeddings", ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): """Build shared word embedding layer """ with tf.name_scope("word_embeddings"): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.embedding_size], initializer=get_initializer(self.initializer_range), ) super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) if inputs_embeds is None: inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [batch_size, length, hidden_size] Returns: float32 tensor with shape [batch_size, length, vocab_size]. """ batch_size = shape_list(inputs)[0] length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.embedding_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.vocab_size]) class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction") self.config = config def call(self, discriminator_hidden_states, training=False): hidden_states = self.dense(discriminator_hidden_states) hidden_states = ACT2FN[self.config.hidden_act](hidden_states) logits = tf.squeeze(self.dense_prediction(hidden_states)) return logits class TFElectraGeneratorPredictions(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") def call(self, generator_hidden_states, training=False): hidden_states = self.dense(generator_hidden_states) hidden_states = ACT2FN["gelu"](hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class TFElectraPreTrainedModel(TFBertPreTrainedModel): config_class = ElectraConfig base_model_prefix = "electra" def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask is None: attention_mask = tf.fill(input_shape, 1) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask): if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.config.num_hidden_layers return head_mask class TFElectraMainLayer(TFElectraPreTrainedModel): config_class = ElectraConfig def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.embeddings = TFElectraEmbeddings(config, name="embeddings") if config.embedding_size != config.hidden_size: self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") self.encoder = TFBertEncoder(config, name="encoder") self.config = config def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) head_mask = self.get_head_mask(head_mask) hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states, training=training) hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training) return hidden_states ELECTRA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.ElectraConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ELECTRA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.ElectraTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " "hidden size and embedding size are different." "" "Both the generator and discriminator checkpoints may be loaded into this model.", ELECTRA_START_DOCSTRING, ) class TFElectraModel(TFElectraPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") def get_input_embeddings(self): return self.electra.embeddings @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraModel tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraModel.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.electra(inputs, **kwargs) return outputs @add_start_docstrings( """ Electra model with a binary classification head on top as used during pre-training for identifying generated tokens. Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model of the two to have the correct classification head to be used for this model.""", ELECTRA_START_DOCSTRING, ) class TFElectraForPreTraining(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") def get_input_embeddings(self): return self.electra.embeddings @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Prediction scores of the head (scores for each token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForPreTraining tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) discriminator_sequence_output = discriminator_hidden_states[0] logits = self.discriminator_predictions(discriminator_sequence_output) output = (logits,) output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) class TFElectraMaskedLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states, training=False): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """ Electra model with a language modeling head on top. Even though both the discriminator and generator may be loaded into this model, the generator is the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) class TFElectraForMaskedLM(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.vocab_size = config.vocab_size self.electra = TFElectraMainLayer(config, name="electra") self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") if isinstance(config.hidden_act, str): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") def get_input_embeddings(self): return self.electra.embeddings def get_output_embeddings(self): return self.generator_lm_head @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForMaskedLM tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ generator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) generator_sequence_output = generator_hidden_states[0] prediction_scores = self.generator_predictions(generator_sequence_output, training=training) prediction_scores = self.generator_lm_head(prediction_scores, training=training) output = (prediction_scores,) output += generator_hidden_states[1:] return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) @add_start_docstrings( """ Electra model with a token classification head on top. Both the discriminator and generator may be loaded into this model.""", ELECTRA_START_DOCSTRING, ) class TFElectraForTokenClassification(TFElectraPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import ElectraTokenizer, TFElectraForTokenClassification tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ discriminator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training ) discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) output = (logits,) output += discriminator_hidden_states[1:] return output # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 Flaubert model. """ import logging import random import tensorflow as tf from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings from .modeling_tf_xlm import ( TFXLMForSequenceClassification, TFXLMMainLayer, TFXLMModel, TFXLMWithLMHeadModel, get_masks, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all Flaubert models at https://huggingface.co/models?filter=flaubert ] FLAUBERT_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.FlaubertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ FLAUBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`): dictionary with ``tf.Tensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", FLAUBERT_START_DOCSTRING, ) class TFFlaubertModel(TFXLMModel): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") class TFFlaubertMainLayer(TFXLMMainLayer): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) def call( self, inputs, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, training=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask langs = inputs[2] if len(inputs) > 2 else langs token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids lengths = inputs[5] if len(inputs) > 5 else lengths cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) langs = inputs.get("langs", langs) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) lengths = inputs.get("lengths", lengths) cache = inputs.get("cache", cache) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: bs, slen = shape_list(input_ids) elif inputs_embeds is not None: bs, slen = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if lengths is None: if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs # assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs) # assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # position_ids if position_ids is None: position_ids = tf.expand_dims(tf.range(slen), axis=0) else: # assert shape_list(position_ids) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: # assert shape_list(langs) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(langs), [bs, slen]) # langs = langs.transpose(0, 1) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids) if langs is not None and self.use_lang_emb: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = self.dropout(tensor, training=training) tensor = tensor * mask[..., tf.newaxis] # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): # LayerDrop dropout_probability = random.uniform(0, 1) if training and (dropout_probability < self.layerdrop): continue if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention if not self.pre_norm: attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) else: tensor_normalized = self.layer_norm1[i](tensor) attn_outputs = self.attentions[i]( [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training ) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN if not self.pre_norm: tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) else: tensor_normalized = self.layer_norm2[i](tensor) tensor = tensor + self.ffns[i](tensor_normalized) tensor = tensor * mask[..., tf.newaxis] # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) @add_start_docstrings( """The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, FLAUBERT_START_DOCSTRING, ) class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") @add_start_docstrings( """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, FLAUBERT_START_DOCSTRING, ) class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification): config_class = FlaubertConfig def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 OpenAI GPT-2 model. """ import logging import numpy as np import tensorflow as tf from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFConv1D, TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, keras_serializable, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2", # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 ] def gelu(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): pass @staticmethod def causal_attention_mask(nd, ns, dtype): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) def _attn(self, inputs, training=False): q, k, v, attention_mask, head_mask = inputs # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. _, _, nd, ns = shape_list(w) b = self.causal_attention_mask(nd, ns, dtype=w.dtype) b = tf.reshape(b, [1, 1, nd, ns]) w = w * b - 1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = tf.nn.softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [tf.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = tf.transpose(x, [0, 2, 1, 3]) x_shape = shape_list(x) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] return tf.reshape(x, new_x_shape) def split_heads(self, x): x_shape = shape_list(x) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] x = tf.reshape(x, new_x_shape) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask, use_cache = inputs x = self.c_attn(x) query, key, value = tf.split(x, 3, axis=2) query = self.split_heads(query) key = self.split_heads(key) value = self.split_heads(value) if layer_past is not None: past_key, past_value = tf.unstack(layer_past, axis=0) key = tf.concat([past_key, key], axis=-2) value = tf.concat([past_value, value], axis=-2) # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if use_cache is True: present = tf.stack([key, value], axis=0) else: present = (None,) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a, training=training) outputs = [a, present] + attn_outputs[1:] return outputs # a, present, (attentions) class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) def call(self, x, training=False): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) h2 = self.dropout(h2, training=training) return h2 class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") self.mlp = TFMLP(4 * nx, config, name="mlp") def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask, use_cache = inputs a = self.ln_1(x) output_attn = self.attn([a, layer_past, attention_mask, head_mask, use_cache], training=training) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a m = self.ln_2(x) m = self.mlp(m, training=training) x = x + m outputs = [x] + output_attn[1:] return outputs # x, present, (attentions) @keras_serializable class TFGPT2MainLayer(tf.keras.layers.Layer): config_class = GPT2Config def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd self.wte = TFSharedEmbeddings( config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" ) self.wpe = tf.keras.layers.Embedding( config.n_positions, config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="wpe", ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") def get_input_embeddings(self): return self.wte def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = shape_list(past[0][0])[-2] if position_ids is None: position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.wte(input_ids, mode="embedding") position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.wte(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] presents = () all_attentions = [] all_hidden_states = () for i, (block, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = block([hidden_states, layer_past, attention_mask, head_mask[i], use_cache], training=training) hidden_states, present = outputs[:2] presents = presents + (present,) if self.output_attentions: all_attentions.append(outputs[2]) hidden_states = self.ln_f(hidden_states) hidden_states = tf.reshape(hidden_states, output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: outputs = outputs + (presents,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, presents, (all hidden_states), (attentions) class TFGPT2PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = GPT2Config base_model_prefix = "transformer" GPT2_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`): :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary. If `past` is used, only `input_ids` that do not have their past calculated should be passed as `input_ids`. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as `input_ids` as they have already been computed. attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", GPT2_START_DOCSTRING, ) class TFGPT2Model(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2Model tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2Model.from_pretrained('gpt2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, ) class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.wte def prepare_inputs_for_generation(self, inputs, past, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past: inputs = tf.expand_dims(inputs[:, -1], -1) return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2LMHeadModel.from_pretrained('gpt2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.transformer.wte(hidden_states, mode="linear") outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, presents, (all hidden_states), (attentions) @add_start_docstrings( """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, GPT2_START_DOCSTRING, ) class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFGPT2MainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( config, initializer_range=config.initializer_range, name="multiple_choice_head" ) def get_output_embeddings(self): return self.transformer.wte @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) def call( self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, use_cache=True, training=False, ): r""" mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.GPT2Config`) and inputs: lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as `input_ids` as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. import tensorflow as tf from transformers1 import GPT2Tokenizer, TFGPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past attention_mask = inputs[2] if len(inputs) > 2 else attention_mask token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids use_cache = inputs[8] if len(inputs) > 8 else use_cache assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") past = inputs.get("past", past) attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None: input_shapes = shape_list(input_ids) else: input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, use_cache, ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) lm_logits = self.transformer.wte(hidden_states, mode="linear") mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_openai.py ================================================ # coding=utf-8 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 OpenAI GPT model.""" import logging import numpy as np import tensorflow as tf from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFConv1D, TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "openai-gpt", # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt ] def gelu(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. Original paper: https://arxiv.org/abs/1606.08415 Args: x: float Tensor to perform activation. Returns: `x` with the GELU activation applied. """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.math.sigmoid(x) ACT_FNS = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), } class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 self.n_ctx = n_ctx self.n_head = config.n_head self.split_size = n_state self.scale = scale self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() def prune_heads(self, heads): pass @staticmethod def causal_attention_mask(nd, ns, dtype): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) def _attn(self, inputs, training=False): q, k, v, attention_mask, head_mask = inputs # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. _, _, nd, ns = shape_list(w) b = self.causal_attention_mask(nd, ns, dtype=w.dtype) b = tf.reshape(b, [1, 1, nd, ns]) w = w * b - 1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask w = w + attention_mask w = tf.nn.softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to if head_mask is not None: w = w * head_mask outputs = [tf.matmul(w, v)] if self.output_attentions: outputs.append(w) return outputs def merge_heads(self, x): x = tf.transpose(x, [0, 2, 1, 3]) x_shape = shape_list(x) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] return tf.reshape(x, new_x_shape) def split_heads(self, x): x_shape = shape_list(x) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] x = tf.reshape(x, new_x_shape) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) def call(self, inputs, training=False): x, attention_mask, head_mask = inputs x = self.c_attn(x) query, key, value = tf.split(x, 3, axis=2) query = self.split_heads(query) key = self.split_heads(key) value = self.split_heads(value) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) a = attn_outputs[0] a = self.merge_heads(a) a = self.c_proj(a) a = self.resid_dropout(a, training=training) outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) def call(self, x, training=False): h = self.act(self.c_fc(x)) h2 = self.c_proj(h) h2 = self.dropout(h2, training=training) return h2 class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.mlp = TFMLP(4 * nx, config, name="mlp") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") def call(self, inputs, training=False): x, attention_mask, head_mask = inputs output_attn = self.attn([x, attention_mask, head_mask], training=training) a = output_attn[0] # output_attn: a, (attentions) n = self.ln_1(x + a) m = self.mlp(n, training=training) h = self.ln_2(n + m) outputs = [h] + output_attn[1:] return outputs # x, (attentions) class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer self.vocab_size = config.vocab_size self.n_embd = config.n_embd self.tokens_embed = TFSharedEmbeddings( config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed" ) self.positions_embed = tf.keras.layers.Embedding( config.n_positions, config.n_embd, embeddings_initializer=get_initializer(config.initializer_range), name="positions_embed", ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] def get_input_embeddings(self): return self.tokens_embed def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} """ raise NotImplementedError def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = shape_list(input_ids) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if position_ids is None: position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_mask = tf.cast(attention_mask, tf.float32) attention_mask = (1.0 - attention_mask) * -10000.0 else: attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: inputs_embeds = self.tokens_embed(input_ids, mode="embedding") position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = self.drop(hidden_states, training=training) output_shape = input_shape + [shape_list(hidden_states)[-1]] all_attentions = [] all_hidden_states = () for i, block in enumerate(self.h): if self.output_hidden_states: all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) outputs = block([hidden_states, attention_mask, head_mask[i]], training=training) hidden_states = outputs[0] if self.output_attentions: all_attentions.append(outputs[1]) hidden_states = tf.reshape(hidden_states, output_shape) # Add last hidden state if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: # let the number of heads free (-1) so we can extract attention even after head pruning attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) outputs = outputs + (all_attentions,) return outputs # last hidden state, (all hidden_states), (attentions) class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = OpenAIGPTConfig base_model_prefix = "transformer" OPENAI_GPT_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.GPT2Tokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTModel.from_pretrained('openai-gpt') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_states = transformer_outputs[0] lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") outputs = (lm_logits,) + transformer_outputs[1:] return outputs # lm_logits, (all hidden_states), (attentions) @add_start_docstrings( """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). """, OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( config, initializer_range=config.initializer_range, name="multiple_choice_head" ) def get_output_embeddings(self): return self.transformer.tokens_embed @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) def call( self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False, ): r""" mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.OpenAIGPTConfig`) and inputs: lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # For example purposes. Not runnable. import tensorflow as tf from transformers1 import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs if input_ids is not None: input_shapes = shape_list(input_ids) else: input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_inputs = [ flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds, ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) mc_logits = tf.squeeze(mc_logits, axis=-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] return outputs # lm logits, mc logits, (all hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_pytorch_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch - TF 2.0 general utilities.""" import logging import os import re import numpy logger = logging.getLogger(__name__) def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): """ Convert a TF 2.0 model variable name in a pytorch model weight name. Conventions for TF2.0 scopes -> PyTorch attribute names conversions: - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) return tuple with: - pytorch model weight name - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other """ tf_name = tf_name.replace(":0", "") # device ids tf_name = re.sub( r"/[^/]*___([^/]*)/", r"/\1/", tf_name ) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) tf_name = tf_name.replace( "_._", "/" ) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators tf_name = tf_name[1:] # Remove level zero # When should we transpose the weights transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name) # Convert standard TF2.0 names in PyTorch names if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma": tf_name[-1] = "weight" if tf_name[-1] == "beta": tf_name[-1] = "bias" # Remove prefix if needed tf_name = ".".join(tf_name) if start_prefix_to_remove: tf_name = tf_name.replace(start_prefix_to_remove, "", 1) return tf_name, transpose ##################### # PyTorch => TF 2.0 # ##################### def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise pt_path = os.path.abspath(pytorch_checkpoint_path) logger.info("Loading PyTorch weights from {}".format(pt_path)) pt_state_dict = torch.load(pt_path, map_location="cpu") logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values()))) return load_pytorch_weights_in_tf2_model( tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys ) def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ pt_state_dict = pt_model.state_dict() return load_pytorch_weights_in_tf2_model( tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys ) def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): """ Load pytorch state_dict in a TF 2.0 model. """ try: import torch # noqa: F401 import tensorflow as tf # noqa: F401 from tensorflow.python.keras import backend as K except ImportError: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise if tf_inputs is None: tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure model is built # Adapt state dict - TODO remove this and update the AWS weights files instead # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] new_keys = [] for key in pt_state_dict.keys(): new_key = None if "gamma" in key: new_key = key.replace("gamma", "weight") if "beta" in key: new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): pt_state_dict[new_key] = pt_state_dict.pop(old_key) # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't start_prefix_to_remove = "" if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()): start_prefix_to_remove = tf_model.base_model_prefix + "." symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights tf_loaded_numel = 0 weight_value_tuples = [] all_pytorch_weights = set(list(pt_state_dict.keys())) for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name name, transpose = convert_tf_weight_name_to_pt_weight_name( sw_name, start_prefix_to_remove=start_prefix_to_remove ) # Find associated numpy array in pytorch model state dict if name not in pt_state_dict: if allow_missing_keys: continue raise AttributeError("{} not found in PyTorch model".format(name)) array = pt_state_dict[name].numpy() if transpose: array = numpy.transpose(array) if len(symbolic_weight.shape) < len(array.shape): array = numpy.squeeze(array) elif len(symbolic_weight.shape) > len(array.shape): array = numpy.expand_dims(array, axis=0) try: assert list(symbolic_weight.shape) == list(array.shape) except AssertionError as e: e.args += (symbolic_weight.shape, array.shape) raise e tf_loaded_numel += array.size # logger.warning("Initialize TF weight {}".format(symbolic_weight.name)) weight_value_tuples.append((symbolic_weight, array)) all_pytorch_weights.discard(name) K.batch_set_value(weight_value_tuples) if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure restore ops are run logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel)) logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights)) return tf_model ##################### # TF 2.0 => PyTorch # ##################### def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise import transformers logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path)) # Instantiate and load the associated TF 2.0 model tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining tf_model_class = getattr(transformers, tf_model_class_name) tf_model = tf_model_class(pt_model.config) if tf_inputs is None: tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure model is built tf_model.load_weights(tf_checkpoint_path, by_name=True) return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys) def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): """ Load TF 2.0 model in a pytorch model """ weights = tf_model.weights return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys) def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False): """ Load TF2.0 symbolic weights in a PyTorch model """ try: import tensorflow as tf # noqa: F401 import torch # noqa: F401 except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise new_pt_params_dict = {} current_pt_params_dict = dict(pt_model.named_parameters()) # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't start_prefix_to_remove = "" if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): start_prefix_to_remove = pt_model.base_model_prefix + "." # Build a map from potential PyTorch weight names to TF 2.0 Variables tf_weights_map = {} for tf_weight in tf_weights: pt_name, transpose = convert_tf_weight_name_to_pt_weight_name( tf_weight.name, start_prefix_to_remove=start_prefix_to_remove ) tf_weights_map[pt_name] = (tf_weight.numpy(), transpose) all_tf_weights = set(list(tf_weights_map.keys())) loaded_pt_weights_data_ptr = {} missing_keys_pt = [] for pt_weight_name, pt_weight in current_pt_params_dict.items(): # Handle PyTorch shared weight ()not duplicated in TF 2.0 if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()] continue # Find associated numpy array in pytorch model state dict if pt_weight_name not in tf_weights_map: if allow_missing_keys: missing_keys_pt.append(pt_weight_name) continue raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name)) array, transpose = tf_weights_map[pt_weight_name] if transpose: array = numpy.transpose(array) if len(pt_weight.shape) < len(array.shape): array = numpy.squeeze(array) elif len(pt_weight.shape) > len(array.shape): array = numpy.expand_dims(array, axis=0) try: assert list(pt_weight.shape) == list(array.shape) except AssertionError as e: e.args += (pt_weight.shape, array.shape) raise e # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name)) new_pt_params_dict[pt_weight_name] = torch.from_numpy(array) loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array) all_tf_weights.discard(pt_weight_name) missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False) missing_keys += missing_keys_pt if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys) ) if len(unexpected_keys) > 0: logger.info( "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys) ) logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights)) return pt_model ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 RoBERTa model. """ import logging import tensorflow as tf from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list logger = logging.getLogger(__name__) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "roberta-base", "roberta-large", "roberta-large-mnli", "distilroberta-base", # See all RoBERTa models at https://huggingface.co/models?filter=roberta ] class TFRobertaEmbeddings(TFBertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.padding_idx = 1 def create_position_ids_from_input_ids(self, x): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param tf.Tensor x: :return tf.Tensor: """ mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) incremental_indicies = tf.math.cumsum(mask, axis=1) * mask return incremental_indicies + self.padding_idx def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. :param tf.Tensor inputs_embeds: :return tf.Tensor: """ seq_length = shape_list(inputs_embeds)[1] position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] return position_ids def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids) else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): return self.embeddings class TFRobertaPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = RobertaConfig base_model_prefix = "roberta" ROBERTA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.RobertaTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`__ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, ) class TFRobertaModel(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaModel tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaModel.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.roberta(inputs, **kwargs) return outputs class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = tf.keras.layers.Activation(gelu) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, features): x = self.dense(features) x = self.act(x) x = self.layer_norm(x) # project back to size of vocabulary with bias x = self.decoder(x, mode="linear") + self.bias return x @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") def get_output_embeddings(self): return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForMaskedLM tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForMaskedLM.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) class TFRobertaClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x, training=training) x = self.dense(x) x = self.dropout(x, training=training) x = self.out_proj(x) return x @add_start_docstrings( """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.classifier = TFRobertaClassificationHead(config, name="classifier") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForSequenceClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.classifier(sequence_output, training=kwargs.get("training", False)) outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForTokenClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.RobertaConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-base is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. import tensorflow as tf from transformers1 import RobertaTokenizer, TFRobertaForQuestionAnswering tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base') input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_t5.py ================================================ # coding=utf-8 # Copyright 2018 T5 Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 T5 model. """ import copy import itertools import logging import math import tensorflow as tf from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list logger = logging.getLogger(__name__) TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", # See all T5 models at https://huggingface.co/models?filter=t5 ] #################################################### # TF 2.0 Models are constructed using Keras imperative API by sub-classing # - tf.keras.layers.Layer for the layers and # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### class TFT5LayerNorm(tf.keras.layers.Layer): def __init__(self, epsilon=1e-6, **kwargs): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ super().__init__(**kwargs) self.variance_epsilon = epsilon def build(self, input_shape): """Build shared word embedding layer """ self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones") super().build(input_shape) def call(self, x): variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True) x = x * tf.math.rsqrt(variance + self.variance_epsilon) return self.weight * x class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi") self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = tf.keras.activations.relu def call(self, hidden_states, training=False): h = self.wi(hidden_states) h = self.act(h) h = self.dropout(h, training=training) h = self.wo(h) return h class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): norm_x = self.layer_norm(hidden_states) y = self.DenseReluDense(norm_x, training=training) layer_output = hidden_states + self.dropout(y, training=training) return layer_output class TFT5Attention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFT5Attention.NEW_ID) self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q") self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k") self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v") self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) if self.has_relative_attention_bias: self.relative_attention_bias = tf.keras.layers.Embedding( self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias", ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on. Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ ret = 0 n = -relative_position if bidirectional: num_buckets //= 2 ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets n = tf.math.abs(n) else: n = tf.math.maximum(n, 0) # now n is in the range [0, inf) max_exact = num_buckets // 2 is_small = tf.math.less(n, max_exact) val_if_large = max_exact + tf.dtypes.cast( tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32, ) val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) ret += tf.where(is_small, n, val_if_large) return ret def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = tf.range(qlen)[:, None] memory_position = tf.range(klen)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) rp_bucket = self._relative_position_bucket( relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) return values def call( self, input, mask=None, kv=None, position_bias=None, cache=None, past_key_value_state=None, head_mask=None, query_length=None, use_cache=False, training=False, ): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) # past_key_value_state[0] is (bs, n_heads, q_len - 1, dim_per_head) bs, qlen, dim = shape_list(input) if past_key_value_state is not None: assert self.is_decoder is True, "Encoder cannot cache past key value states" assert ( len(past_key_value_state) == 2 ), "past_key_value_state should have 2 past states: keys and values. Got {} past states".format( len(past_key_value_state) ) real_qlen = qlen + shape_list(past_key_value_state[0])[2] if query_length is None else query_length else: real_qlen = qlen if kv is None: klen = real_qlen else: klen = shape_list(kv)[1] def shape(x): """ projection """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3)) def unshape(x): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim)) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif past_key_value_state is None: k = v = kv k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if past_key_value_state is not None: if kv is None: k_, v_ = past_key_value_state k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = past_key_value_state # to cope with keras serialization # we need to cast `use_cache` to correct bool # if it is a tensor if tf.is_tensor(use_cache): if hasattr(use_cache, "numpy"): use_cache = bool(use_cache.numpy()) else: use_cache = True if self.is_decoder and use_cache is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) scores = tf.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(real_qlen, klen) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] if mask is not None: position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) context = self.o(context) outputs = (context,) + present_key_value_state if self.output_attentions: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) return outputs class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.SelfAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call( self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, training=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention( norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, use_cache=use_cache, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.EncDecAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call( self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, past_key_value_state=None, query_length=None, use_cache=False, training=False, ): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, past_key_value_state=past_key_value_state, query_length=query_length, use_cache=use_cache, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them return outputs class TFT5Block(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.is_decoder = config.is_decoder self.layer = [] self.layer.append( TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",) ) if self.is_decoder: self.layer.append( TFT5LayerCrossAttention( config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1", ) ) self.layer.append(TFT5LayerFF(config, name="layer_._{}".format(len(self.layer)))) def call( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None, past_key_value_state=None, use_cache=False, training=False, ): if past_key_value_state is not None: assert self.is_decoder, "Only decoder can use `past_key_value_states`" expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4 error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format( expected_num_past_key_value_states, "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "", len(past_key_value_state), ) assert len(past_key_value_state) == expected_num_past_key_value_states, error_message self_attn_past_key_value_state = past_key_value_state[:2] cross_attn_past_key_value_state = past_key_value_state[2:] else: self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask, past_key_value_state=self_attn_past_key_value_state, use_cache=use_cache, training=training, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights if self.is_decoder and encoder_hidden_states is not None: # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: query_length = shape_list(present_key_value_state[0])[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, kv=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, head_mask=head_mask, past_key_value_state=cross_attn_past_key_value_state, query_length=query_length, use_cache=use_cache, training=training, ) hidden_states = cross_attention_outputs[0] # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states, training=training) outputs = (hidden_states,) # Add attentions if we output them outputs = outputs + (present_key_value_state,) + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class _NoLayerEmbedTokens(object): """ this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer' class to avoid problem with weight restoring. Also it makes sure that the layer is called from the correct scope to avoid problem with saving/storing the correct weights """ def __init__(self, layer, abs_scope_name=None): self._layer = layer self._abs_scope_name = abs_scope_name def call(self, inputs, mode="embedding"): if self._abs_scope_name is None: return self._layer.call(inputs, mode) # if an abs scope name is given to the embedding variable, call variable from absolute scope with tf.compat.v1.variable_scope(self._abs_scope_name, auxiliary_name_scope=False) as abs_scope_name: with tf.name_scope(abs_scope_name.original_name_scope): return self._layer.call(inputs, mode) def __call__(self, inputs, mode="embedding"): if self._abs_scope_name is None: return self._layer(inputs, mode) # if an abs scope name is given to the embedding variable, call variable from absolute scope with tf.compat.v1.variable_scope(self._abs_scope_name, auxiliary_name_scope=False) as abs_scope_name: with tf.name_scope(abs_scope_name.original_name_scope): return self._layer(inputs, mode) #################################################### # The full model without a specific pretrained or finetuning head is # provided as a tf.keras.layers.Layer usually called "TFT5MainLayer" #################################################### class TFT5MainLayer(tf.keras.layers.Layer): def __init__(self, config, embed_tokens=None, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder self.config = config self.num_hidden_layers = config.num_layers self.block = [ TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),) for i in range(config.num_layers) ] self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def get_input_embeddings(self): return self.embed_tokens def get_output_embeddings(self): return self.embed_tokens def set_embed_tokens(self, embed_tokens): self.embed_tokens = embed_tokens def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models def call( self, inputs, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, past_key_value_states=None, use_cache=False, training=False, ): if inputs is not None and inputs_embeds is not None: raise ValueError("You cannot specify both inputs and inputs_embeds at the same time") elif inputs is not None: input_shape = shape_list(inputs) inputs = tf.reshape(inputs, (-1, input_shape[-1])) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either inputs or inputs_embeds") if inputs_embeds is None: assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(inputs) batch_size, seq_length = input_shape if past_key_value_states is not None: assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( input_shape, (batch_size, 1) ) # required mask seq length can be calculated via length of past # key value states and seq_length = 1 for the last token mask_seq_length = shape_list(past_key_value_states[0][0])[2] + seq_length else: mask_seq_length = seq_length if attention_mask is None: attention_mask = tf.fill((batch_size, mask_seq_length), 1) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = shape_list(encoder_hidden_states)[1] encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1) # initialize past_key_value_states with `None` if past does not exist if past_key_value_states is None: past_key_value_states = [None] * len(self.block) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. attention_mask = tf.cast(attention_mask, dtype=tf.float32) num_dims_attention_mask = len(shape_list(attention_mask)) if num_dims_attention_mask == 3: extended_attention_mask = attention_mask[:, None, :, :] elif num_dims_attention_mask == 2: # Provided a padding mask of dimensions [batch_size, mask_seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] if self.is_decoder: seq_ids = tf.range(mask_seq_length) causal_mask = tf.less_equal( tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None], ) causal_mask = tf.cast(causal_mask, dtype=tf.float32) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] if past_key_value_states[0] is not None: extended_attention_mask = extended_attention_mask[:, :, -1:, :] else: extended_attention_mask = attention_mask[:, None, None, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # extended_attention_mask = tf.math.equal(extended_attention_mask, # tf.transpose(extended_attention_mask, perm=(-1, -2))) extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 if self.is_decoder and encoder_attention_mask is not None: # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, mask_seq_length, mask_seq_length] # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32) num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) if num_dims_encoder_attention_mask == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if num_dims_encoder_attention_mask == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) present_key_value_states = () all_hidden_states = () all_attentions = () position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds, training=training) for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i], past_key_value_state=past_key_value_state, use_cache=use_cache, training=training, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states, present_key_value_state = layer_outputs[:2] if i == 0: # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[3 if self.output_attentions else 2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3] # append next layer key value states present_key_value_states = present_key_value_states + (present_key_value_state,) if self.output_attentions: all_attentions = all_attentions + (layer_outputs[2],) hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states, training=training) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if use_cache is True: assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) outputs = outputs + (present_key_value_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) #################################################### # TFT5PreTrainedModel is a sub-class of tf.keras.Model # which take care of loading and saving pretrained weights # and various common utilities. # Here you just need to specify a few (self-explanatory) # pointers for your model. #################################################### class TFT5PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = T5Config base_model_prefix = "transformer" @property def dummy_inputs(self): inputs = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) dummy_inputs = { "inputs": inputs, "decoder_input_ids": inputs, "decoder_attention_mask": input_mask, } return dummy_inputs T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: https://arxiv.org/abs/1910.10683 .. _`tf.keras.Model`: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model Note on the model inputs: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with inputs only and nothing else: `model(inputs_ids) - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: `model([inputs, attention_mask])` or `model([inputs, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associaed to the input names given in the docstring: `model({'inputs': inputs, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ T5_INPUTS_DOCSTRING = r""" Args: inputs are usually used as a `dict` (see T5 description above for more information) containing all the following. inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the inputs on the right or the left. Indices can be obtained using :class:`transformers1.T5Tokenizer`. To know more on how to prepare :obj:`inputs` for pre-training take a look at `T5 Training <./t5.html#training>`_ . See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.convert_tokens_to_ids` for details. decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`). attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up decoding. If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`). inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`inputs` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `inputs` indices into associated vectors than the model's internal embedding lookup matrix. decoder_inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at `T5 Training <./t5.html#training>`_ . head_mask: (:obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ @add_start_docstrings( "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", T5_START_DOCSTRING, ) class TFT5Model(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") # retrieve correct absolute scope for embed token wrapper with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name: pass embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name) encoder_config = copy.deepcopy(config) self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder") def get_input_embeddings(self): return self.shared def get_output_embeddings(self): return self.shared def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output. decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import T5Tokenizer, TFT5Model tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5Model.from_pretrained('t5-small') inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 outputs = model(inputs, decoder_input_ids=inputs) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if isinstance(inputs, dict): kwargs.update(inputs) else: kwargs["inputs"] = inputs # retrieve arguments inputs = kwargs.get("inputs", None) inputs_embeds = kwargs.get("inputs_embeds", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) decoder_attention_mask = kwargs.get("decoder_attention_mask", None) decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) use_cache = kwargs.get("use_cache", True) head_mask = kwargs.get("head_mask", None) # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( inputs, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] return decoder_outputs + encoder_outputs @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) class TFT5ForConditionalGeneration(TFT5PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") # retrieve correct absolute scope for embed token wrapper with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name: pass embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name) encoder_config = copy.deepcopy(config) self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder") def get_input_embeddings(self): return self.shared def get_output_embeddings(self): return self.shared def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.T5Config`) and inputs. loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): Classification loss (cross entropy). prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``): Contains pre-computed key and value hidden-states of the attention blocks. Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. Examples:: from transformers1 import T5Tokenizer, TFT5ForConditionalGeneration tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 outputs = model(inputs, decoder_input_ids=inputs) prediction_scores = outputs[0] tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1 model.generate(inputs) """ if isinstance(inputs, dict): kwargs.update(inputs) else: kwargs["inputs"] = inputs # retrieve arguments inputs = kwargs.get("inputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) decoder_attention_mask = kwargs.get("decoder_attention_mask", None) decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) use_cache = kwargs.get("use_cache", True) inputs_embeds = kwargs.get("inputs_embeds", None) decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) head_mask = kwargs.get("head_mask", None) # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( inputs, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] # Decode decoder_outputs = self.decoder( decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_value_states=decoder_past_key_value_states, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=head_mask, use_cache=use_cache, ) # insert decoder past at right place # to speed up decoding if use_cache is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() lm_logits = embed_tokens(sequence_output, mode="linear") decoder_outputs = (lm_logits,) + decoder_outputs[1:] return decoder_outputs + encoder_outputs def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs): assert past is not None, "past has to be defined for encoder_outputs" # first step if len(past) < 2: encoder_outputs, decoder_past_key_value_states = past, None else: encoder_outputs, decoder_past_key_value_states = past[0], past[1] return { "inputs": None, # inputs don't have to be defined, but still need to be passed to make Keras.layer.__call__ happy "decoder_input_ids": inputs, # inputs are the decoder_input_ids "decoder_past_key_value_states": decoder_past_key_value_states, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "use_cache": use_cache, } def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if len(past) < 2: logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") return past decoder_past = past[1] past = (past[0],) reordered_decoder_past = () for layer_past_states in decoder_past: # get the correct batch idx from layer past batch dim # batch dim of `past` is at 2nd position reordered_layer_past_states = () for layer_past_state in layer_past_states: # need to set correct `past` for each of the four key / value states reordered_layer_past_states = reordered_layer_past_states + (tf.gather(layer_past_state, beam_idx),) assert shape_list(reordered_layer_past_states[0]) == shape_list(layer_past_states[0]) assert len(reordered_layer_past_states) == len(layer_past_states) reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return past + (reordered_decoder_past,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 Transformer XL model. """ import logging import tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "transfo-xl-wt103", # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl ] class TFPositionalEmbedding(tf.keras.layers.Layer): def __init__(self, demb, **kwargs): super().__init__(**kwargs) self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) def call(self, pos_seq, bsz=None): sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) if bsz is not None: return tf.tile(pos_emb[:, None, :], [1, bsz, 1]) else: return pos_emb[:, None, :] class TFPositionwiseFF(tf.keras.layers.Layer): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): super().__init__(**kwargs) self.d_model = d_model self.d_inner = d_inner self.dropout = dropout self.layer_1 = tf.keras.layers.Dense( d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" ) self.drop_1 = tf.keras.layers.Dropout(dropout) self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") self.drop_2 = tf.keras.layers.Dropout(dropout) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.pre_lnorm = pre_lnorm def call(self, inp, training=False): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = self.layer_norm(inp) core_out = self.layer_1(core_out) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) # residual connection output = core_out + inp else: # positionwise feed-forward core_out = self.layer_1(inp) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): def __init__( self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs ): super().__init__(**kwargs) self.output_attentions = output_attentions self.n_head = n_head self.d_model = d_model self.d_head = d_head self.dropout = dropout self.qkv_net = tf.keras.layers.Dense( 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" ) self.drop = tf.keras.layers.Dropout(dropout) self.dropatt = tf.keras.layers.Dropout(dropatt) self.o_net = tf.keras.layers.Dense( d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" ) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm if r_r_bias is not None and r_w_bias is not None: # Biases are shared self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias else: self.r_r_bias = None self.r_w_bias = None self.r_net = tf.keras.layers.Dense( self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" ) def build(self, input_shape): if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) super().build(input_shape) def _rel_shift(self, x): x_size = shape_list(x) x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]]) x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]]) x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) x = tf.reshape(x, x_size) return x def call(self, inputs, training=False): w, r, attn_mask, mems, head_mask = inputs qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1] if mems is not None: cat = tf.concat([mems, w], 0) if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(cat)) else: w_heads = self.qkv_net(cat) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) w_head_q = w_head_q[-qlen:] else: if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(w)) else: w_heads = self.qkv_net(w) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) klen = shape_list(w_head_k)[0] w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] attn_score = AC + BD attn_score = attn_score * self.scale # compute attention probability if attn_mask is not None: attn_mask_t = attn_mask[:, :, None, None] attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t # [qlen x klen x bsz x n_head] attn_prob = tf.nn.softmax(attn_score, axis=1) attn_prob = self.dropatt(attn_prob, training=training) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # compute attention vector attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out, training=training) if self.pre_lnorm: # residual connection outputs = [w + attn_out] else: # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: outputs.append(attn_prob) return outputs class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): def __init__( self, n_head, d_model, d_head, d_inner, dropout, tgt_len=None, ext_len=None, mem_len=None, dropatt=0.0, pre_lnorm=False, r_w_bias=None, r_r_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs ): super().__init__(**kwargs) self.dec_attn = TFRelPartialLearnableMultiHeadAttn( n_head, d_model, d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, output_attentions=output_attentions, layer_norm_epsilon=layer_norm_epsilon, name="dec_attn", ) self.pos_ff = TFPositionwiseFF( d_model, d_inner, dropout, pre_lnorm=pre_lnorm, init_std=init_std, layer_norm_epsilon=layer_norm_epsilon, name="pos_ff", ) def call(self, inputs, training=False): dec_inp, r, dec_attn_mask, mems, head_mask = inputs attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training) ff_output = self.pos_ff(attn_outputs[0], training=training) outputs = [ff_output] + attn_outputs[1:] return outputs class TFAdaptiveEmbedding(tf.keras.layers.Layer): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super().__init__(**kwargs) self.n_token = n_token self.d_embed = d_embed self.init_std = init_std self.cutoffs = cutoffs + [n_token] self.div_val = div_val self.d_proj = d_proj self.emb_scale = d_proj ** 0.5 self.cutoff_ends = [0] + self.cutoffs self.emb_layers = [] self.emb_projs = [] if div_val == 1: raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.emb_layers.append( tf.keras.layers.Embedding( r_idx - l_idx, d_emb_i, embeddings_initializer=get_initializer(init_std), name="emb_layers_._{}".format(i), ) ) def build(self, input_shape): for i in range(len(self.cutoffs)): d_emb_i = self.d_embed // (self.div_val ** i) self.emb_projs.append( self.add_weight( shape=(d_emb_i, self.d_proj), initializer=get_initializer(self.init_std), trainable=True, name="emb_projs_._{}".format(i), ) ) super().build(input_shape) def call(self, inp): if self.div_val == 1: raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: inp_flat = tf.reshape(inp, (-1,)) emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj]) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx emb_i = self.emb_layers[i](inp_i) emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i]) mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64)) embed_shape = shape_list(inp) + [self.d_proj] embed = tf.reshape(emb_flat, embed_shape) embed *= self.emb_scale return embed @keras_serializable class TFTransfoXLMainLayer(tf.keras.layers.Layer): config_class = TransfoXLConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model self.n_head = config.n_head self.d_head = config.d_head self.untie_r = config.untie_r self.word_emb = TFAdaptiveEmbedding( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, init_std=config.init_std, name="word_emb", ) self.drop = tf.keras.layers.Dropout(config.dropout) self.n_layer = config.n_layer self.tgt_len = config.tgt_len self.mem_len = config.mem_len self.ext_len = config.ext_len self.max_klen = config.tgt_len + config.ext_len + config.mem_len self.attn_type = config.attn_type self.layers = [] if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( TFRelPartialLearnableDecoderLayer( config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, r_w_bias=None if self.untie_r else self.r_w_bias, r_r_bias=None if self.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, init_std=config.init_std, name="layers_._{}".format(i), ) ) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.same_length = config.same_length self.clamp_len = config.clamp_len if self.attn_type == 0: # default attention self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb") else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint def build(self, input_shape): if not self.untie_r: self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) super().build(input_shape) def get_input_embeddings(self): return self.word_emb def _resize_token_embeddings(self, new_num_tokens): return self.word_emb def backward_compatible(self): self.sample_softmax = -1 def reset_length(self, tgt_len, ext_len, mem_len): self.tgt_len = tgt_len self.mem_len = mem_len self.ext_len = ext_len def _prune_heads(self, heads): raise NotImplementedError def init_mems(self, bsz): if self.mem_len > 0: mems = [] for i in range(self.n_layer): empty = tf.zeros([self.mem_len, bsz, self.d_model]) mems.append(empty) return mems else: return None def _update_mems(self, hids, mems, mlen, qlen): # does not deal with None if mems is None: return None # mems is not None assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens # will be used as the extended context. Hence, we only cache # the tokens from `mlen + qlen - self.ext_len - self.mem_len` # to `mlen + qlen - self.ext_len`. new_mems = [] end_idx = mlen + max(0, qlen - 0 - self.ext_len) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): cat = tf.concat([mems[i], hids[i]], axis=0) tf.stop_gradient(cat) new_mems.append(cat[beg_idx:end_idx]) return new_mems def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = tf.transpose(input_ids, perm=(1, 0)) qlen, bsz = shape_list(input_ids) elif inputs_embeds is not None: inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) qlen, bsz = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if mems is None: mems = self.init_mems(bsz) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layer if inputs_embeds is not None: word_emb = inputs_embeds else: word_emb = self.word_emb(input_ids) mlen = shape_list(mems[0])[0] if mems is not None else 0 klen = mlen + qlen attn_mask = tf.ones([qlen, qlen]) mask_u = tf.linalg.band_part(attn_mask, 0, -1) mask_dia = tf.linalg.band_part(attn_mask, 0, 0) attn_mask_pad = tf.zeros([qlen, mlen]) dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.linalg.band_part(attn_mask, -1, 0) dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1) # ::: PyTorch masking code for reference ::: # if self.same_length: # all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) # mask_len = klen - self.mem_len # if mask_len > 0: # mask_shift_len = qlen - mask_len # else: # mask_shift_len = qlen # dec_attn_mask = (torch.triu(all_ones, 1+mlen) # + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 # else: # dec_attn_mask = torch.triu( # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] hids = [] attentions = [] if self.attn_type == 0: # default pos_seq = tf.range(klen - 1, -1, -1.0) if self.clamp_len > 0: pos_seq = tf.minimum(pos_seq, self.clamp_len) pos_emb = self.pos_emb(pos_seq) core_out = self.drop(word_emb, training=training) pos_emb = self.drop(pos_emb, training=training) for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out, training=training) new_mems = self._update_mems(hids, mems, mlen, qlen) # We transpose back here to shape [bsz, len, hidden_dim] outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems] if self.output_hidden_states: # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] hids.append(core_out) hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids) outputs.append(hids) if self.output_attentions: # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) outputs.append(attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions) class TFTransfoXLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = TransfoXLConfig base_model_prefix = "transformer" TRANSFO_XL_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.TransfoXLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.TransfoXLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLModel(TFTransfoXLPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFTransfoXLMainLayer(config, name="transformer") @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import TransfoXLTokenizer, TFTransfoXLModel tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] """ outputs = self.transformer(inputs, **kwargs) return outputs class TFTransfoXLLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.sample_softmax = config.sample_softmax assert ( self.sample_softmax <= 0 ), "Sampling from the softmax is not implemented yet. Please look at issue: #3310: https://github.com/huggingface/transformers/issues/3310" self.crit = TFAdaptiveSoftmaxMask( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit" ) def get_output_embeddings(self): """ Double-check if you are using adaptive softmax. """ if len(self.crit.out_layers) > 0: return self.crit.out_layers[-1] return None def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) def init_mems(self, bsz): return self.transformer.init_mems(bsz) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import TransfoXLTokenizer, TFTransfoXLLMHeadModel tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds labels = inputs[4] if len(inputs) > 4 else labels assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get("input_ids") mems = inputs.get("mems", mems) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) labels = inputs.get("labels", labels) assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs if input_ids is not None: bsz, tgt_len = shape_list(input_ids)[:2] else: bsz, tgt_len = shape_list(inputs_embeds)[:2] transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] outputs = transformer_outputs[1:] softmax_output = self.crit([pred_hid, labels], training=training) outputs = [softmax_output] + outputs return outputs # logits, new_mems, (all hidden states), (all attentions) def prepare_inputs_for_generation(self, inputs, past, **model_kwargs): inputs = {"inputs": inputs} # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_transfo_xl_utilities.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ A TF 2.0 Adaptive Softmax for Transformer XL model. """ import tensorflow as tf from .modeling_tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super().__init__(**kwargs) self.vocab_size = vocab_size self.d_embed = d_embed self.d_proj = d_proj self.cutoffs = cutoffs + [vocab_size] self.cutoff_ends = [0] + self.cutoffs self.div_val = div_val self.shortlist_size = self.cutoffs[0] self.n_clusters = len(self.cutoffs) - 1 self.head_size = self.shortlist_size + self.n_clusters self.keep_order = keep_order self.out_layers = [] self.out_projs = [] def build(self, input_shape): if self.n_clusters > 0: self.cluster_weight = self.add_weight( shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" ) self.cluster_bias = self.add_weight( shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" ) if self.div_val == 1: for i in range(len(self.cutoffs)): if self.d_proj != self.d_embed: weight = self.add_weight( shape=(self.d_embed, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i), ) self.out_projs.append(weight) else: self.out_projs.append(None) weight = self.add_weight( shape=(self.vocab_size, self.d_embed,), initializer="zeros", trainable=True, name="out_layers_._{}_._weight".format(i), ) bias = self.add_weight( shape=(self.vocab_size,), initializer="zeros", trainable=True, name="out_layers_._{}_._bias".format(i), ) self.out_layers.append((weight, bias)) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = self.d_embed // (self.div_val ** i) weight = self.add_weight( shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) ) self.out_projs.append(weight) weight = self.add_weight( shape=(r_idx - l_idx, d_emb_i,), initializer="zeros", trainable=True, name="out_layers_._{}_._weight".format(i), ) bias = self.add_weight( shape=(r_idx - l_idx,), initializer="zeros", trainable=True, name="out_layers_._{}_._bias".format(i), ) self.out_layers.append((weight, bias)) super().build(input_shape) @staticmethod def _logit(x, W, b, proj=None): y = x if proj is not None: y = tf.einsum("ibd,ed->ibe", y, proj) return tf.einsum("ibd,nd->ibn", y, W) + b @staticmethod def _gather_logprob(logprob, target): lp_size = shape_list(logprob) r = tf.range(lp_size[0]) idx = tf.stack([r, target], 1) return tf.gather_nd(logprob, idx) def call(self, inputs, return_mean=True, training=False): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) out = tf.nn.log_softmax(output, axis=-1) else: hidden_sizes = shape_list(hidden) out = [] loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] if target is not None: mask = (target >= l_idx) & (target < r_idx) mask_idx = tf.where(mask) cur_target = tf.boolean_mask(target, mask) - l_idx if self.div_val == 1: cur_W = self.out_layers[0][0][l_idx:r_idx] cur_b = self.out_layers[0][1][l_idx:r_idx] else: cur_W = self.out_layers[i][0] cur_b = self.out_layers[i][1] if i == 0: cur_W = tf.concat([cur_W, self.cluster_weight], 0) cur_b = tf.concat([cur_b, self.cluster_bias], 0) head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) head_logprob = tf.nn.log_softmax(head_logit) out.append(head_logprob[..., : self.cutoffs[0]]) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) else: tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) tail_logprob = tf.nn.log_softmax(tail_logit) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob out.append(logprob_i) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] if target is not None: loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) out = tf.concat(out, axis=-1) if target is not None: if return_mean: loss = tf.reduce_mean(loss) # Add the training-time loss value to the layer using `self.add_loss()`. self.add_loss(loss) # Log the loss as a metric (we could log arbitrary metrics, # including different metrics for training and inference. self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") return out ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """TF general model utils.""" import functools import logging import os import h5py import numpy as np import tensorflow as tf from tensorflow.python.keras.saving import hdf5_format from .configuration_utils import PretrainedConfig from .file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) class TFModelUtilsMixin: """ A few utilities for `tf.keras.Model`s, to be used as a mixin. """ def num_parameters(self, only_trainable: bool = False) -> int: """ Get number of (optionally, trainable) parameters in the model. """ if only_trainable: return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables)) else: return self.count_params() def keras_serializable(cls): """ Decorate a Keras Layer class to support Keras serialization. This is done by: 1. adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at serialization time 2. wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and convert it to a config object for the actual layer initializer 3. registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model` :param cls: a tf.keras.layers.Layers subclass that accepts a `config` argument to its initializer (typically a `TF*MainLayer` class in this project) :return: the same class object, with modifications for Keras deserialization. """ initializer = cls.__init__ config_class = getattr(cls, "config_class", None) if config_class is None: raise AttributeError("Must set `config_class` to use @keras_serializable") @functools.wraps(initializer) def wrapped_init(self, *args, **kwargs): transformers_config = kwargs.pop("transformers_config", None) config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.get("config", None) if config is not None and transformers_config is not None: raise ValueError("Must pass either `config` or `transformers_config`, not both") elif config is not None: # normal layer construction, call with unchanged args (config is already in there) initializer(self, *args, **kwargs) elif transformers_config is not None: # Keras deserialization, convert dict to config config = config_class.from_dict(transformers_config) initializer(self, config, *args, **kwargs) else: raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)") self._transformers_config = config cls.__init__ = wrapped_init if not hasattr(cls, "get_config"): raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses") if hasattr(cls.get_config, "_is_default"): def get_config(self): cfg = super(cls, self).get_config() cfg["transformers_config"] = self._transformers_config.to_dict() return cfg cls.get_config = get_config cls._keras_serializable = True if hasattr(tf.keras.utils, "register_keras_serializable"): cls = tf.keras.utils.register_keras_serializable()(cls) return cls class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): r""" Base class for all TF models. :class:`~transformers1.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. Class attributes (overridden by derived classes): - ``config_class``: a class derived from :class:`~transformers1.PretrainedConfig` to use as configuration class for this model architecture. - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - ``model``: an instance of the relevant subclass of :class:`~transformers1.PreTrainedModel`, - ``config``: an instance of the relevant subclass of :class:`~transformers1.PretrainedConfig`, - ``path``: a path (string) to the TensorFlow checkpoint. - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. """ config_class = None base_model_prefix = "" @property def dummy_inputs(self): """ Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ ) ) # Save config in model self.config = config def get_input_embeddings(self): """ Returns the model's input embeddings. Returns: :obj:`tf.keras.layers.Layer`: A torch module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: return base_model.get_input_embeddings() else: raise NotImplementedError def get_output_embeddings(self): """ Returns the model's output embeddings. Returns: :obj:`tf.keras.layers.Layer`: A torch module mapping hidden states to vocabulary. """ return None # Overwrite for models with output embeddings def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): """ Build a resized Embedding Variable from a provided token Embedding Module. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end Args: new_num_tokens: (`optional`) int New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end If not provided or None: return the provided token Embedding Module. Return: ``tf.Variable`` Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None """ # if new_num_tokens is None: # return old_embeddings # old_num_tokens, old_embedding_dim = old_embeddings.weight.size() # if old_num_tokens == new_num_tokens: # return old_embeddings # # Build new embeddings # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) # new_embeddings.to(old_embeddings.weight.device) # # initialize all new embeddings (in particular added tokens) # self._init_weights(new_embeddings) # # Copy token embeddings from the previous weights # num_tokens_to_copy = min(old_num_tokens, new_num_tokens) # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] # return new_embeddings def resize_token_embeddings(self, new_num_tokens=None): """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Arguments: new_num_tokens: (`optional`) int: New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. Return: ``tf.Variable`` Pointer to the input tokens Embeddings Module of the model """ raise NotImplementedError def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. Arguments: heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). """ raise NotImplementedError def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the :func:`~transformers1.PreTrainedModel.from_pretrained` class method. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # Save configuration file self.config.save_pretrained(save_directory) # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME) self.save_weights(output_model_file) logger.info("Model weights saved in {}".format(output_model_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) one of: - an instance of a class derived from :class:`~transformers1.PretrainedConfig`, or - a string valid as input to :func:`~transformers1.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. from_pt: (`optional`) boolean, default False: Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: # For example purposes. Not runnable. model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) """ config = kwargs.pop("config", None) cache_dir = kwargs.pop("cache_dir", None) from_pt = kwargs.pop("from_pt", False) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) use_cdn = kwargs.pop("use_cdn", True) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, **kwargs, ) else: model_kwargs = kwargs # Load model if pretrained_model_name_or_path is not None: if os.path.isdir(pretrained_model_name_or_path): if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): # Load from a TF 2.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: raise EnvironmentError( "Error no file named {} found in directory {} or `from_pt` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path ) ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME), use_cdn=use_cdn, ) try: # Load from URL or cache if already cached resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, ) if resolved_archive_file is None: raise EnvironmentError except EnvironmentError: msg = ( f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {TF2_WEIGHTS_NAME}, {WEIGHTS_NAME}.\n\n" ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None # Instantiate model. model = cls(config, *model_args, **model_kwargs) if from_pt: # Load from a PyTorch checkpoint return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True) model(model.dummy_inputs, training=False) # build the network with dummy inputs assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) # 'by_name' allow us to do transfer learning by skipping/adding layers # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 try: model.load_weights(resolved_archive_file, by_name=True) except OSError: raise OSError( "Unable to load weights from h5 file. " "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " ) model(model.dummy_inputs, training=False) # Make sure restore ops are run # Check if the models are the same to output loading informations with h5py.File(resolved_archive_file, "r") as f: if "layer_names" not in f.attrs and "model_weights" in f: f = f["model_weights"] hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) model_layer_names = set(layer.name for layer in model.layers) missing_keys = list(model_layer_names - hdf5_layer_names) unexpected_keys = list(hdf5_layer_names - model_layer_names) error_msgs = [] if len(missing_keys) > 0: logger.info( "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) ) if len(unexpected_keys) > 0: logger.info( "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) ) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) if output_loading_info: loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} return model, loading_info return model def prepare_inputs_for_generation(self, inputs, **kwargs): return {"inputs": inputs} def _use_cache(self, outputs, use_cache): """During generation, decide whether to pass the `past` variable to the next forward pass.""" if len(outputs) <= 1 or use_cache is False: return False if hasattr(self.config, "mem_len") and self.config.mem_len == 0: return False return True def generate( self, input_ids=None, max_length=None, min_length=None, do_sample=None, early_stopping=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, length_penalty=None, no_repeat_ngram_size=None, num_return_sequences=None, attention_mask=None, decoder_start_token_id=None, use_cache=None, ): r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling and beam-search. Adapted in part from `Facebook's XLM beam search code`_. .. _`Facebook's XLM beam search code`: https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 Parameters: input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` The sequence used as a prompt for the generation. If `None` the method initializes it as an empty `tf.Tensor` of shape `(1,)`. max_length: (`optional`) int The max length of the sequence to be generated. Between 1 and infinity. Default to 20. min_length: (`optional`) int The min length of the sequence to be generated. Between 0 and infinity. Default to 0. do_sample: (`optional`) bool If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. early_stopping: (`optional`) bool if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. num_beams: (`optional`) int Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. temperature: (`optional`) float The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. top_k: (`optional`) int The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. top_p: (`optional`) float The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. repetition_penalty: (`optional`) float The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. bos_token_id: (`optional`) int Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. pad_token_id: (`optional`) int Pad token. Defaults to pad_token_id as defined in the models config. eos_token_id: (`optional`) int EOS token. Defaults to eos_token_id as defined in the models config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. bad_words_ids: (`optional`) list of lists of int `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. Defaults to `None`. `What are attention masks? <../glossary.html#attention-mask>`__ decoder_start_token_id=None: (`optional`) int If an encoder-decoder model starts decoding with a different token than BOS. Defaults to `None` and is changed to `BOS` later. use_cache: (`optional`) bool If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. Return: output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping use_cache = use_cache if use_cache is not None else self.config.use_cache num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) if input_ids is not None: batch_size = shape_list(input_ids)[0] # overriden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(use_cache, bool), "`use_cache` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." assert temperature > 0, "`temperature` should be strictely positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictely positive." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictely positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = tf.fill((batch_size, 1), bos_token_id) else: assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) elif attention_mask is None: attention_mask = tf.ones_like(input_ids) if pad_token_id is None and eos_token_id is not None: logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size cur_len = shape_list(input_ids)[1] vocab_size = self.config.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = shape_list(input_ids)[-1] input_ids = tf.broadcast_to( tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) attention_mask = tf.broadcast_to( tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) input_ids = tf.reshape( input_ids, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = tf.reshape( attention_mask, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = tf.reshape( tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), shape=(-1,), ) # expand encoder_outputs encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0), *encoder_outputs[1:]) else: encoder_outputs = None cur_len = shape_list(input_ids)[-1] if num_beams > 1: output = self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, batch_size=effective_batch_size, num_return_sequences=num_return_sequences, length_penalty=length_penalty, num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, ) else: output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, batch_size=effective_batch_size, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, ) return output def _generate_no_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, vocab_size, encoder_outputs, attention_mask, use_cache, ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ # length of generated sentences / unfinished sentences unfinished_sents = tf.ones_like(input_ids[:, 0]) sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = _create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: # create eos_token_id boolean mask is_token_logit_eos_token = tf.convert_to_tensor( [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool ) eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) next_token_logits = set_tensor_by_indices_to_value( next_token_logits, eos_token_indices_mask, -float("inf") ) if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # Top-p/top-k filtering next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) # Sample next_token = tf.squeeze( tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 ) else: # Greedy decoding next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( unfinished_sents, tf.cast(eos_in_sents, tf.int32) ) sent_lengths = ( sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + cur_len * is_sents_unfinished_and_token_to_add_is_eos ) # unfinished_sents is set to zero if eos in sentence unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos # stop when there is a in each sentence, or if we exceed the maximul length if tf.math.reduce_max(unfinished_sents) == 0: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded min_sent_length = tf.math.reduce_min(sent_lengths) max_sent_length = tf.math.reduce_max(sent_lengths) if min_sent_length != max_sent_length: assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id # create length masks for tf.where operation broad_casted_sent_lengths = tf.broadcast_to( tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] ) broad_casted_range = tf.transpose( tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) ) decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) else: decoded = input_ids return decoded def _generate_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, early_stopping, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, decoder_start_token_id, eos_token_id, batch_size, num_return_sequences, length_penalty, num_beams, vocab_size, encoder_outputs, attention_mask, use_cache, ): """ Generate sequences for each example with beam search. """ # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) for _ in range(batch_size) ] # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times if do_sample is False: beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) else: beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) # cache compute states past = encoder_outputs # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache ) outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = _create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # calculate log softmax score scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: # create eos_token_id boolean mask num_batch_hypotheses = batch_size * num_beams is_token_logit_eos_token = tf.convert_to_tensor( [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool ) eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 num_batch_hypotheses = batch_size * num_beams banned_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len ) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) scores = set_tensor_by_indices_to_value( scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: banned_tokens_indices_mask.append( [True if token in banned_tokens_slice else False for token in range(vocab_size)] ) scores = set_tensor_by_indices_to_value( scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) assert shape_list(scores) == [batch_size * num_beams, vocab_size] if do_sample: _scores = scores + tf.broadcast_to( beam_scores[:, None], (batch_size * num_beams, vocab_size) ) # (batch_size * num_beams, vocab_size) # Top-p/top-k filtering _scores = tf_top_k_top_p_filtering( _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) next_tokens = tf.random.categorical( _scores, dtype=tf.int32, num_samples=2 * num_beams ) # (batch_size, 2 * num_beams) # Compute next scores next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) # sort the sampled vector to make sure that the first num_beams samples are the best next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) else: # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) next_scores = scores + tf.broadcast_to( beam_scores[:, None], (batch_size * num_beams, vocab_size) ) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) next_scores = tf.reshape( next_scores, (batch_size, num_beams * vocab_size) ) # (batch_size, num_beams * vocab_size) next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] # next batch beam content next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): # if we are done with this sentence if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams ), "Batch can only be done if at least {} beams have been generated".format(num_beams) assert ( eos_token_id is not None and pad_token_id is not None ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content next_sent_beam = [] # next tokens for this sentence for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence or last iteration if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: continue generated_hyps[batch_idx].add( tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() ) else: # add next predicted token if it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: break # Check if were done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len=cur_len ) # update next beam content assert len(next_sent_beam) == num_beams, "Beam should always be full" next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_idx + 1) # stop when we are done with each sentence if all(done): break # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32) beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32) beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32) # re-order batch and update current length input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx]) input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1) cur_len = cur_len + 1 # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # finalize all open beam hypotheses and end to generated hypotheses for batch_idx in range(batch_size): # Add all open beam hypothesis to generated_hyps if done[batch_idx]: continue # test that beam scores match previously calculated scores if not eos and batch_idx not done if eos_token_id is not None and all( (token_id % vocab_size).numpy().item() is not eos_token_id for token_id in next_tokens[batch_idx] ): assert tf.reduce_all( next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] ) # need to add best num_beams hypotheses to generated hyps for beam_id in range(num_beams): effective_beam_id = batch_idx * num_beams + beam_id final_score = beam_scores[effective_beam_id].numpy().item() final_tokens = input_ids[effective_beam_id] generated_hyps[batch_idx].add(final_tokens, final_score) # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch output_batch_size = batch_size if do_sample else batch_size * num_return_sequences output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences # select the best hypotheses sent_lengths_list = [] best = [] # retrieve best hypotheses for i, hypotheses in enumerate(generated_hyps): sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) for j in range(output_num_return_sequences_per_batch): best_hyp = sorted_hyps.pop()[1] sent_lengths_list.append(len(best_hyp)) best.append(best_hyp) assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format( output_batch_size, len(best) ) sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32) # shorter batches are filled with pad_token if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy(): assert pad_token_id is not None, "`Pad_token_id` has to be defined" sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length) decoded_list = [] # fill with hypothesis and eos_token_id if necessary for i, hypo in enumerate(best): assert sent_lengths[i] == shape_list(hypo)[0] # if sent_length is max_len do not pad if sent_lengths[i] == sent_max_len: decoded_slice = hypo else: # else pad to sent_max_len num_pad_tokens = sent_max_len - sent_lengths[i] padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32) decoded_slice = tf.concat([hypo, padding], axis=-1) # finish sentence with EOS token if sent_lengths[i] < max_length: decoded_slice = tf.where( tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i], eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), decoded_slice, ) # add to list decoded_list.append(decoded_slice) decoded = tf.stack(decoded_list) else: # none of the hypotheses have an eos_token assert (len(hypo) == max_length for hypo in best) decoded = tf.stack(best) return decoded @staticmethod def _reorder_cache(past, beam_idx): return tuple(tf.gather(layer_past, beam_idx, axis=1) for layer_past in past) def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): # create logit penalties for already seen input_ids token_penalties = np.ones(shape_list(logits)) prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] for i, prev_input_id in enumerate(prev_input_ids): logit_penalized = logits[i].numpy()[prev_input_id] logit_penalties = np.zeros(logit_penalized.shape) # if previous logit score is < 0 then multiply repetition penalty else divide logit_penalties[logit_penalized < 0] = repetition_penalty logit_penalties[logit_penalized > 0] = 1 / repetition_penalty np.put(token_penalties[i], prev_input_id, logit_penalties) return tf.convert_to_tensor(token_penalties, dtype=tf.float32) def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].numpy().tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): banned_tokens = [] def _tokens_match(prev_tokens, tokens): if len(tokens) == 0: # if bad word tokens is just one token always ban it return True if len(tokens) > len(prev_input_ids): # if bad word tokens are longer then prev input_ids they can't be equal return False if prev_tokens[-len(tokens) :] == tokens: # if tokens match return True else: return False for prev_input_ids_slice in prev_input_ids: banned_tokens_slice = [] for banned_token_seq in bad_words_ids: assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( bad_words_ids ) if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: # if tokens do not match continue continue banned_tokens_slice.append(banned_token_seq[-1]) banned_tokens.append(banned_tokens_slice) return banned_tokens def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ logits_shape = shape_list(logits) if top_k > 0: top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) if top_p < 1.0: sorted_indices = tf.argsort(logits, direction="DESCENDING") sorted_logits = tf.gather( logits, sorted_indices, axis=-1, batch_dims=1 ) # expects logits to be of dim (batch_size, vocab_size) cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove = tf.concat( [ tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), sorted_indices_to_remove[:, min_tokens_to_keep:], ], -1, ) # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) sorted_indices_to_remove = tf.concat( [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, ) # scatter sorted tensors to original indexing indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) return logits def scatter_values_on_batch_indices(values, batch_indices): shape = shape_list(batch_indices) # broadcast batch dim to shape broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) # transform batch_indices to pair_indices pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) # scatter values to pair indices return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) def set_tensor_by_indices_to_value(tensor, indices, value): # create value_tensor since tensor value assignment is not possible in TF value_tensor = tf.zeros_like(tensor) + value return tf.where(indices, value_tensor, tensor) class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. """ self.max_length = max_length - 1 # ignoring bos_token self.length_penalty = length_penalty self.early_stopping = early_stopping self.num_beams = num_beams self.beams = [] self.worst_score = 1e9 def __len__(self): """ Number of hypotheses in the list. """ return len(self.beams) def add(self, hyp, sum_logprobs): """ Add a new hypothesis to the list. """ score = sum_logprobs / len(hyp) ** self.length_penalty if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_scores[0][1]] self.worst_score = sorted_scores[1][0] else: self.worst_score = min(score, self.worst_score) def is_done(self, best_sum_logprobs, cur_len=None): """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. """ if len(self) < self.num_beams: return False elif self.early_stopping: return True else: if cur_len is None: cur_len = self.max_length cur_score = best_sum_logprobs / cur_len ** self.length_penalty ret = self.worst_score >= cur_score return ret class TFConv1D(tf.keras.layers.Layer): def __init__(self, nf, nx, initializer_range=0.02, **kwargs): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ super().__init__(**kwargs) self.nf = nf self.nx = nx self.initializer_range = initializer_range def build(self, input_shape): self.weight = self.add_weight( "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) ) self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) def call(self, x): bz, sl = shape_list(x)[:2] x = tf.reshape(x, [-1, self.nx]) x = tf.matmul(x, self.weight) + self.bias x = tf.reshape(x, [bz, sl, self.nf]) return x class TFSharedEmbeddings(tf.keras.layers.Layer): """Construct shared token embeddings. """ def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """Build shared token embedding layer Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) ) super().build(input_shape) def call(self, inputs, mode="embedding"): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": return self._embedding(inputs) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) def _embedding(self, input_ids): """Applies embedding based on inputs tensor.""" return tf.gather(self.weight, input_ids) def _linear(self, inputs): """Computes logits by running inputs through a linear layer. Args: inputs: A float32 tensor with shape [..., hidden_size] Returns: float32 tensor with shape [..., vocab_size]. """ first_dims = shape_list(inputs)[:-1] x = tf.reshape(inputs, [-1, self.hidden_size]) logits = tf.matmul(x, self.weight, transpose_b=True) return tf.reshape(logits, first_dims + [self.vocab_size]) class TFSequenceSummary(tf.keras.layers.Layer): r""" Compute a single vector summary of a sequence hidden states according to various possibilities: Args of the config class: summary_type: - 'last' => [default] take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj: Add a projection after the vector extraction summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ def __init__(self, config, initializer_range=0.02, **kwargs): super().__init__(**kwargs) self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj if self.has_summary: if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = tf.keras.layers.Dense( num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" if self.has_activation: self.activation = tf.keras.activations.tanh self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 if self.has_last_dropout: self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) def call(self, inputs, training=False): """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. cls_index: [optional] position of the classification token if summary_type == 'cls_index', shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ if not isinstance(inputs, (dict, tuple, list)): hidden_states = inputs cls_index = None elif isinstance(inputs, (tuple, list)): hidden_states = inputs[0] cls_index = inputs[1] if len(inputs) > 1 else None assert len(inputs) <= 2, "Too many inputs." else: hidden_states = inputs.get("hidden_states") cls_index = inputs.get("cls_index", None) if self.summary_type == "last": output = hidden_states[:, -1] elif self.summary_type == "first": output = hidden_states[:, 0] elif self.summary_type == "mean": output = tf.reduce_mean(hidden_states, axis=1) elif self.summary_type == "cls_index": hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims] if cls_index is None: cls_index = tf.fill( hidden_shape[:-2], hidden_shape[-2] - 1 ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length cls_shape = shape_list(cls_index) if len(cls_shape) <= len(hidden_shape) - 2: cls_index = cls_index[..., tf.newaxis] # else: # cls_index = cls_index[..., tf.newaxis] # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) output = tf.squeeze( output, axis=len(hidden_shape) - 2 ) # shape of output: (batch, num choices, hidden_size) elif self.summary_type == "attn": raise NotImplementedError if self.has_first_dropout: output = self.first_dropout(output, training=training) if self.has_summary: output = self.summary(output) if self.has_activation: output = self.activation(output) if self.has_last_dropout: output = self.last_dropout(output, training=training) return output def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() dynamic = tf.shape(x) return [dynamic[i] if s is None else s for i, s in enumerate(static)] def get_initializer(initializer_range=0.02): """Creates a `tf.initializers.truncated_normal` with the given range. Args: initializer_range: float, initializer range for stddev. Returns: TruncatedNormal initializer with stddev = `initializer_range`. """ return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLM model. """ import itertools import logging import math import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-mlm-en-2048", "xlm-mlm-ende-1024", "xlm-mlm-enfr-1024", "xlm-mlm-enro-1024", "xlm-mlm-tlm-xnli15-1024", "xlm-mlm-xnli15-1024", "xlm-clm-enfr-1024", "xlm-clm-ende-1024", "xlm-mlm-17-1280", "xlm-mlm-100-1280", # See all XLM models at https://huggingface.co/models?filter=xlm ] def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): """ Generate hidden states mask, and optionally an attention mask. """ bs = shape_list(lengths)[0] if padding_mask is not None: mask = padding_mask else: # assert lengths.max().item() <= slen alen = tf.range(slen) mask = tf.math.less(alen, lengths[:, tf.newaxis]) # attention mask is the same as mask, or triangular inferior attention (causal) if causal: attn_mask = tf.less_equal( tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis] ) else: attn_mask = mask # sanity check # assert shape_list(mask) == [bs, slen] tf.debugging.assert_equal(shape_list(mask), [bs, slen]) assert causal is False or shape_list(attn_mask) == [bs, slen, slen] mask = tf.cast(mask, dtype=dtype) attn_mask = tf.cast(attn_mask, dtype=dtype) return mask, attn_mask class TFMultiHeadAttention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFMultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim self.n_heads = n_heads assert self.dim % self.n_heads == 0 self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError def call(self, inputs, training=False): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ input, mask, kv, cache, head_mask = inputs # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) n_heads = self.n_heads dim_per_head = self.dim // n_heads mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) def shape(x): """ projection """ return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) def unshape(x): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores = scores - 1e30 * (1.0 - mask) weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: outputs = outputs + (weights,) return outputs class TFTransformerFFN(tf.keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super().__init__(**kwargs) self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, input, training=False): x = self.lin1(input) x = self.act(x) x = self.lin2(x) x = self.dropout(x, training=training) return x class TFXLMMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states # encoder / decoder, output layer self.is_encoder = config.is_encoder self.is_decoder = not config.is_encoder if self.is_decoder: raise NotImplementedError("Currently XLM can only be used as an encoder") # self.with_output = with_output self.causal = config.causal # dictionary / languages self.n_langs = config.n_langs self.use_lang_emb = config.use_lang_emb self.n_words = config.n_words self.eos_index = config.eos_index self.pad_index = config.pad_index # self.dico = dico # self.id2lang = config.id2lang # self.lang2id = config.lang2id # assert len(self.dico) == self.n_words # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.dropout = tf.keras.layers.Dropout(config.dropout) self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) self.position_embeddings = tf.keras.layers.Embedding( config.max_position_embeddings, self.dim, embeddings_initializer=get_initializer(config.embed_init_std), name="position_embeddings", ) if config.sinusoidal_embeddings: raise NotImplementedError # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: self.lang_embeddings = tf.keras.layers.Embedding( self.n_langs, self.dim, embeddings_initializer=get_initializer(config.embed_init_std), name="lang_embeddings", ) self.embeddings = TFSharedEmbeddings( self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" ) # padding_idx=self.pad_index) self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") # transformer layers self.attentions = [] self.layer_norm1 = [] self.ffns = [] self.layer_norm2 = [] # if self.is_decoder: # self.layer_norm15 = [] # self.encoder_attn = [] for i in range(self.n_layers): self.attentions.append( TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i)) ) self.layer_norm1.append( tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i)) ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) self.ffns.append( TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i)) ) self.layer_norm2.append( tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i)) ) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() config.pruned_heads = {} for layer, heads in pruned_heads: if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) def get_input_embeddings(self): return self.embeddings def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError def call( self, inputs, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, training=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask langs = inputs[2] if len(inputs) > 2 else langs token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids lengths = inputs[5] if len(inputs) > 5 else lengths cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) langs = inputs.get("langs", langs) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) lengths = inputs.get("lengths", lengths) cache = inputs.get("cache", cache) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: bs, slen = shape_list(input_ids) elif inputs_embeds is not None: bs, slen = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if lengths is None: if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs # assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs) # assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # position_ids if position_ids is None: position_ids = tf.expand_dims(tf.range(slen), axis=0) else: # assert shape_list(position_ids) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: # assert shape_list(langs) == [bs, slen] # (slen, bs) tf.debugging.assert_equal(shape_list(langs), [bs, slen]) # langs = langs.transpose(0, 1) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids) if langs is not None and self.use_lang_emb and self.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = self.dropout(tensor, training=training) tensor = tensor * mask[..., tf.newaxis] # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) tensor = tensor * mask[..., tf.newaxis] # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) class TFXLMPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLMConfig base_model_prefix = "transformer" @property def dummy_inputs(self): # Sometimes XLM has language embeddings so don't forget to build them as well if needed inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) if self.config.use_lang_emb and self.config.n_langs > 1: langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} XLM_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLMConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`): dictionary with ``tf.Tensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", XLM_START_DOCSTRING, ) class TFXLMModel(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMModel tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMModel.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs class TFXLMPredLayer(tf.keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index if config.asm is False: self.input_embeddings = input_embeddings else: raise NotImplementedError # self.proj = nn.AdaptiveLogSoftmaxWithLoss( # in_features=dim, # n_classes=config.n_words, # cutoffs=config.asm_cutoffs, # div_value=config.asm_div_value, # head_bias=True, # default is False # ) def build(self, input_shape): # The output weights are the same as the input embeddings, but there is an output-only bias for each token. self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @add_start_docstrings( """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLM_START_DOCSTRING, ) class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") def get_output_embeddings(self): return self.pred_layer.input_embeddings def prepare_inputs_for_generation(self, inputs, **kwargs): mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id effective_batch_size = inputs.shape[0] mask_token = tf.ones((effective_batch_size, 1), dtype=tf.int32) * mask_token_id inputs = tf.concat([inputs, mask_token], axis=1) if lang_id is not None: langs = tf.ones_like(inputs) * lang_id else: langs = None return {"inputs": inputs, "langs": langs} @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMWithLMHeadModel tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] outputs = self.pred_layer(output) outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_START_DOCSTRING, ) class TFXLMForSequenceClassification(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLMMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMForSequenceClassification tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] logits = self.sequence_summary(output) outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLMTokenizer, TFXLMForQuestionAnsweringSimple tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLM-RoBERTa model. """ import logging from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, ) logger = logging.getLogger(__name__) TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta ] XLM_ROBERTA_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaModel(TFRobertaModel): """ This class overrides :class:`~transformers1.TFRobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): """ This class overrides :class:`~transformers1.TFRobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): """ This class overrides :class:`~transformers1.TFRobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): """ This class overrides :class:`~transformers1.TFRobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_tf_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 XLNet model. """ import logging import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, keras_serializable, shape_list, ) from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlnet-base-cased", "xlnet-large-cased", # See all XLNet models at https://huggingface.co/models?filter=xlnet ] def gelu(x): """ Implementation of the gelu activation function. XLNet is using OpenAI GPT's gelu Also see https://arxiv.org/abs/1606.08415 """ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def swish(x): return x * tf.sigmoid(x) ACT2FN = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.layers.Activation(swish), } class TFXLNetRelativeAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.d_model, config.n_head) ) self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model self.scale = 1 / (config.d_head ** 0.5) self.initializer_range = config.initializer_range self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): initializer = get_initializer(self.initializer_range) self.q = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q" ) self.k = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k" ) self.v = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v" ) self.o = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o" ) self.r = self.add_weight( shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r" ) self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) self.r_s_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias" ) self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) self.seg_embed = self.add_weight( shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed" ) super().build(input_shape) def prune_heads(self, heads): raise NotImplementedError def rel_shift(self, x, klen=-1): """perform relative shift to form the relative attention score.""" x_size = shape_list(x) x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3])) x = x[1:, ...] x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3])) x = x[:, 0:klen, :, :] # x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) return x def rel_attn_core(self, inputs, training=False): """Core relative positional attention operations.""" q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs # content based attention score ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h) # position based attention score bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift(bd, klen=shape_list(ac)[1]) # segment based attention score if seg_mat is None: ef = 0 else: ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == tf.float16: attn_score = attn_score - 65500 * attn_mask else: attn_score = attn_score - 1e30 * attn_mask # attention probability attn_prob = tf.nn.softmax(attn_score, axis=1) attn_prob = self.dropout(attn_prob, training=training) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # attention output attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, attn_prob return attn_vec def post_attention(self, inputs, residual=True, training=False): """Post-attention processing.""" # post-attention projection (back to `d_model`) h, attn_vec = inputs attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out, training=training) if residual: attn_out = attn_out + h output = self.layer_norm(attn_out) return output def call(self, inputs, training=False): (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h # content-based key head k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # h-stream # content-stream query head q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h # post processing output_h = self.post_attention([h, attn_vec_h], training=training) # g-stream # query-stream query head q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g # post processing output_g = self.post_attention([g, attn_vec_g], training=training) if self.output_attentions: attn_prob = attn_prob_h, attn_prob_g else: # Multi-head attention with relative positional encoding if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h # content heads q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training ) if self.output_attentions: attn_vec, attn_prob = attn_vec # post processing output_h = self.post_attention([h, attn_vec], training=training) output_g = None outputs = (output_h, output_g) if self.output_attentions: outputs = outputs + (attn_prob,) return outputs class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.layer_1 = tf.keras.layers.Dense( config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" ) self.layer_2 = tf.keras.layers.Dense( config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2" ) self.dropout = tf.keras.layers.Dropout(config.dropout) if isinstance(config.ff_activation, str): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation def call(self, inp, training=False): output = inp output = self.layer_1(output) output = self.activation_function(output) output = self.dropout(output, training=training) output = self.layer_2(output) output = self.dropout(output, training=training) output = self.layer_norm(output + inp) return output class TFXLNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") self.ff = TFXLNetFeedForward(config, name="ff") self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, inputs, training=False): outputs = self.rel_attn(inputs, training=training) output_h, output_g = outputs[:2] if output_g is not None: output_g = self.ff(output_g, training=training) output_h = self.ff(output_h, training=training) outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there return outputs class TFXLNetLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") hidden_states = hidden_states + self.bias return hidden_states @keras_serializable class TFXLNetMainLayer(tf.keras.layers.Layer): config_class = XLNetConfig def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.mem_len = config.mem_len self.reuse_len = config.reuse_len self.d_model = config.d_model self.same_length = config.same_length self.attn_type = config.attn_type self.bi_data = config.bi_data self.clamp_len = config.clamp_len self.n_layer = config.n_layer self.use_bfloat16 = config.use_bfloat16 self.initializer_range = config.initializer_range self.word_embedding = TFSharedEmbeddings( config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding" ) self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) def get_input_embeddings(self): return self.word_embedding def build(self, input_shape): initializer = get_initializer(self.initializer_range) self.mask_emb = self.add_weight( shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb" ) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError def _prune_heads(self, heads_to_prune): raise NotImplementedError def create_mask(self, qlen, mlen, dtype=tf.float32): """ Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. Args: qlen: TODO Lysandre didn't fill mlen: TODO Lysandre didn't fill :: same_length=False: same_length=True: < qlen > < qlen > ^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1] qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1] [0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1] v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0] """ attn_mask = tf.ones([qlen, qlen], dtype=dtype) mask_u = tf.matrix_band_part(attn_mask, 0, -1) mask_dia = tf.matrix_band_part(attn_mask, 0, 0) attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype) ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.matrix_band_part(attn_mask, -1, 0) ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1) return ret def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: curr_out = curr_out[: self.reuse_len] if prev_mem is None: new_mem = curr_out[-self.mem_len :] else: new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :] return tf.stop_gradient(new_mem) @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1) pos_emb = pos_emb[:, None, :] if bsz is not None: pos_emb = tf.tile(pos_emb, [1, bsz, 1]) return pos_emb def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None): """create relative positional encoding.""" freq_seq = tf.range(0, self.d_model, 2.0) if dtype is not None and dtype != tf.float32: freq_seq = tf.cast(freq_seq, dtype=dtype) inv_freq = 1 / (10000 ** (freq_seq / self.d_model)) if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = tf.range(beg, end, -1.0) bwd_pos_seq = tf.range(-beg, -end, 1.0) if dtype is not None and dtype != tf.float32: fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype) if self.clamp_len > 0: fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len) bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len) if bsz is not None: # With bi_data, the batch size should be divisible by 2. assert bsz % 2 == 0 fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1) else: fwd_pos_seq = tf.range(beg, end, -1.0) if dtype is not None and dtype != tf.float32: fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) if self.clamp_len > 0: fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len) pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) return pos_emb def call( self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, training=False, ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask mems = inputs[2] if len(inputs) > 2 else mems perm_mask = inputs[3] if len(inputs) > 3 else perm_mask target_mapping = inputs[4] if len(inputs) > 4 else target_mapping token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids input_mask = inputs[6] if len(inputs) > 6 else input_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds use_cache = inputs[9] if len(inputs) > 9 else use_cache assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) mems = inputs.get("mems", mems) perm_mask = inputs.get("perm_mask", perm_mask) target_mapping = inputs.get("target_mapping", target_mapping) token_type_ids = inputs.get("token_type_ids", token_type_ids) input_mask = inputs.get("input_mask", input_mask) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) use_cache = inputs.get("use_cache", use_cache) assert len(inputs) <= 10, "Too many inputs." else: input_ids = inputs # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = tf.transpose(input_ids, perm=(1, 0)) qlen, bsz = shape_list(input_ids)[:2] elif inputs_embeds is not None: inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) qlen, bsz = shape_list(inputs_embeds)[:2] else: raise ValueError("You have to specify either input_ids or inputs_embeds") token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32 # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] elif self.attn_type == "bi": attn_mask = None else: raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, ( "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." ) if input_mask is None and attention_mask is not None: input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float) if input_mask is not None and perm_mask is not None: data_mask = input_mask[None] + perm_mask elif input_mask is not None and perm_mask is None: data_mask = input_mask[None] elif input_mask is None and perm_mask is not None: data_mask = perm_mask else: data_mask = None if data_mask is not None: # all mems can be attended to if mlen > 0: mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float) data_mask = tf.concat([mems_mask, data_mask], axis=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] else: attn_mask += data_mask[:, :, :, None] if attn_mask is not None: attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float) if attn_mask is not None: non_tgt_mask = -tf.eye(qlen, dtype=dtype_float) if mlen > 0: non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1) non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float) else: non_tgt_mask = None # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1]) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q, training=training) else: output_g = None # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) cat_ids = tf.concat([mem_pad, token_type_ids], 0) else: cat_ids = token_type_ids # `1` indicates not in the same segment [qlen x klen x bsz] seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32) seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float) else: seg_mat = None # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float) pos_emb = self.dropout(pos_emb, training=training) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layer new_mems = () if mems is None: mems = [None] * len(self.layer) attentions = [] hidden_states = [] for i, layer_module in enumerate(self.layer): # cache new mems if self.mem_len is not None and self.mem_len > 0 and use_cache is True: new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) outputs = layer_module( [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]], training=training, ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) # Add last hidden state if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) output = self.dropout(output_g if output_g is not None else output_h, training=training) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) outputs = (tf.transpose(output, perm=(1, 0, 2)),) if self.mem_len is not None and self.mem_len > 0 and use_cache is True: outputs = outputs + (new_mems,) if self.output_hidden_states: if output_g is not None: hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs) else: hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states) outputs = outputs + (hidden_states,) if self.output_attentions: attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) outputs = outputs + (attentions,) return outputs # outputs, (new_mems), (hidden_states), (attentions) class TFXLNetPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLNetConfig base_model_prefix = "transformer" XLNET_START_DOCSTRING = r""" .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers1.XLNetConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLNET_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.XLNetTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. perm_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``: If ``perm_mask[k, i, j] = 0``, i attend to j in batch k; if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. If None, each token attends to all the others (full bidirectional attention). Only used during pretraining (to define factorization order) or for sequential decoding (generation). target_mapping (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding (generation). token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ input_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. Kept for compatibility with the original code base. You can only uses one of `input_mask` and `attention_mask` Mask values selected in ``[0, 1]``: ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. """ @add_start_docstrings( "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", XLNET_START_DOCSTRING, ) class TFXLNetModel(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetModel tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetModel.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.transformer(inputs, **kwargs) return outputs @add_start_docstrings( """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss") def get_output_embeddings(self): return self.lm_loss.input_embeddings def prepare_inputs_for_generation(self, inputs, past, **kwargs): # Add dummy token at the end (no attention on this one) effective_batch_size = inputs.shape[0] dummy_token = tf.zeros((effective_batch_size, 1), dtype=tf.int32) inputs = tf.concat([inputs, dummy_token], axis=1) # Build permutation mask so that previous tokens don't see last token sequence_length = inputs.shape[1] perm_mask = tf.zeros((effective_batch_size, sequence_length, sequence_length - 1), dtype=tf.float32) perm_mask_seq_end = tf.ones((effective_batch_size, sequence_length, 1), dtype=tf.float32) perm_mask = tf.concat([perm_mask, perm_mask_seq_end], axis=-1) # We'll only predict the last token target_mapping = tf.zeros((effective_batch_size, 1, sequence_length - 1), dtype=tf.float32) target_mapping_seq_end = tf.ones((effective_batch_size, 1, 1), dtype=tf.float32) target_mapping = tf.concat([target_mapping, target_mapping_seq_end], axis=-1) inputs = { "inputs": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "use_cache": kwargs["use_cache"], } # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf import numpy as np from transformers1 import XLNetTokenizer, TFXLNetLMHeadModel tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. input_ids = tf.constant(tokenizer.encode("Hello, my dog is very ", add_special_tokens=True))[None, :] # We will predict the masked token perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1])) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32)) next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ transformer_outputs = self.transformer(inputs, **kwargs) hidden_state = transformer_outputs[0] logits = self.lm_loss(hidden_state) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLNET_START_DOCSTRING, ) class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary( config, initializer_range=config.initializer_range, name="sequence_summary" ) self.logits_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForSequenceClassification tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNET_START_DOCSTRING, ) class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): Classification scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForTokenClassification tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ transformer_outputs = self.transformer(inputs, **kwargs) output = transformer_outputs[0] logits = self.classifier(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it return outputs # return logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers1 import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] """ transformer_outputs = self.transformer(inputs, **kwargs) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (mems), (hidden_states), (attentions) # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # the hidden-states output to compute `span start logits` and `span end logits`). """, # XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) # class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): # r""" # Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: # **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)`` # Log probabilities for the top config.start_n_top start token possibilities (beam-search). # **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)`` # Indices for the top config.start_n_top start token possibilities (beam-search). # **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` # Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). # **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` # Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). # **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) # ``tf.Tensor`` of shape ``(batch_size,)`` # Log probabilities for the ``is_impossible`` label of the answers. # **mems**: # list of ``tf.Tensor`` (one for each layer): # that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model # if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. # See details in the docstring of the `mems` input above. # **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) # list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) # of shape ``(batch_size, sequence_length, hidden_size)``: # Hidden-states of the model at the output of each layer plus the initial embedding outputs. # **attentions**: (`optional`, returned when ``config.output_attentions=True``) # list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: # Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. # Examples:: # # For example purposes. Not runnable. # tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') # model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') # input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 # start_positions = tf.constant([1]) # end_positions = tf.constant([3]) # outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) # loss, start_scores, end_scores = outputs[:2] # """ # def __init__(self, config, *inputs, **kwargs): # super().__init__(config, *inputs, **kwargs) # self.start_n_top = config.start_n_top # self.end_n_top = config.end_n_top # self.transformer = TFXLNetMainLayer(config, name='transformer') # self.start_logits = TFPoolerStartLogits(config, name='start_logits') # self.end_logits = TFPoolerEndLogits(config, name='end_logits') # self.answer_class = TFPoolerAnswerClass(config, name='answer_class') # def call(self, inputs, training=False): # transformer_outputs = self.transformer(inputs, training=training) # hidden_states = transformer_outputs[0] # start_logits = self.start_logits(hidden_states, p_mask=p_mask) # outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it # if start_positions is not None and end_positions is not None: # # If we are on multi-GPU, let's remove the dimension added by batch splitting # for x in (start_positions, end_positions, cls_index, is_impossible): # if x is not None and x.dim() > 1: # x.squeeze_(-1) # # during training, compute the end logits based on the ground truth of the start position # end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) # loss_fct = CrossEntropyLoss() # start_loss = loss_fct(start_logits, start_positions) # end_loss = loss_fct(end_logits, end_positions) # total_loss = (start_loss + end_loss) / 2 # if cls_index is not None and is_impossible is not None: # # Predict answerability from the representation of CLS and START # cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) # loss_fct_cls = nn.BCEWithLogitsLoss() # cls_loss = loss_fct_cls(cls_logits, is_impossible) # # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # total_loss += cls_loss * 0.5 # outputs = (total_loss,) + outputs # else: # # during inference, compute the end logits based on beam search # bsz, slen, hsz = hidden_states.size() # start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) # start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) # start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) # start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) # start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) # hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) # p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None # end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) # end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) # end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) # end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) # end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) # start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states # cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample # outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs # # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # # or (if labels are provided) (total_loss,) # return outputs ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py """ import logging import torch import torch.nn as nn import torch.nn.functional as F from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ "transfo-xl-wt103", # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl ] def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} if hasattr(model, "transformer"): # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax tf_to_pt_map.update( { "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias, } ) for i, (out_l, proj_l, tie_proj) in enumerate( zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs) ): layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i if config.tie_weight: tf_to_pt_map.update({layer_str + "b": out_l.bias}) else: raise NotImplementedError # I don't think this is implemented in the TF code tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias}) if not tie_proj: tf_to_pt_map.update({layer_str + "proj": proj_l}) # Now load the rest of the transformer model = model.transformer # Embeddings for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): layer_str = "transformer/adaptive_embed/cutoff_%d/" % i tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l}) # Transformer blocks for i, b in enumerate(model.layers): layer_str = "transformer/layer_%d/" % i tf_to_pt_map.update( { layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, } ) # Relative positioning biases if config.untie_r: r_r_list = [] r_w_list = [] for b in model.layers: r_r_list.append(b.dec_attn.r_r_bias) r_w_list.append(b.dec_attn.r_w_bias) else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list}) return tf_to_pt_map def load_tf_weights_in_transfo_xl(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ try: import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_to_pytorch_map(model, config) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) tf_weights[name] = array for name, pointer in tf_to_pt_map.items(): assert name in tf_weights array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if "kernel" in name or "proj" in name: array = np.transpose(array) if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] try: assert p_i.shape == arr_i.shape except AssertionError as e: e.args += (p_i.shape, arr_i.shape) raise logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) p_i.data = torch.from_numpy(arr_i) else: try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) tf_weights.pop(name + "/Adam", None) tf_weights.pop(name + "/Adam_1", None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model class PositionalEmbedding(nn.Module): def __init__(self, demb): super().__init__() self.demb = demb inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) self.register_buffer("inv_freq", inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: return pos_emb[:, None, :].expand(-1, bsz, -1) else: return pos_emb[:, None, :] class PositionwiseFF(nn.Module): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): super().__init__() self.d_model = d_model self.d_inner = d_inner self.dropout = dropout self.CoreNet = nn.Sequential( nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), nn.Dropout(dropout), nn.Linear(d_inner, d_model), nn.Dropout(dropout), ) self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.pre_lnorm = pre_lnorm def forward(self, inp): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = self.CoreNet(self.layer_norm(inp)) # residual connection output = core_out + inp else: # positionwise feed-forward core_out = self.CoreNet(inp) # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output class RelPartialLearnableMultiHeadAttn(nn.Module): def __init__( self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False, layer_norm_epsilon=1e-5, ): super().__init__() self.output_attentions = output_attentions self.n_head = n_head self.d_model = d_model self.d_head = d_head self.dropout = dropout self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) self.drop = nn.Dropout(dropout) self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm if r_r_bias is None or r_w_bias is None: # Biases are not shared self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) else: self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) def _rel_shift(self, x): zero_pad_shape = (x.size(0), 1) + x.size()[2:] zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) x_padded = torch.cat([zero_pad, x], dim=1) x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] x_padded = x_padded.view(*x_padded_shape) x = x_padded[1:].view_as(x) return x def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) if mems is not None: cat = torch.cat([mems, w], 0) if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(cat)) else: w_heads = self.qkv_net(cat) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) w_head_q = w_head_q[-qlen:] else: if self.pre_lnorm: w_heads = self.qkv_net(self.layer_norm(w)) else: w_heads = self.qkv_net(w) r_head_k = self.r_net(r) w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) klen = w_head_k.size(0) w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] attn_score = AC + BD attn_score.mul_(self.scale) # compute attention probability if attn_mask is not None and torch.sum(attn_mask).item(): attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: if next(self.parameters()).dtype == torch.float16: attn_score = ( attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score) ) else: attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score) elif attn_mask.dim() == 3: if next(self.parameters()).dtype == torch.float16: attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score) else: attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] attn_prob = F.softmax(attn_score, dim=1) attn_prob = self.dropatt(attn_prob) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * head_mask # compute attention vector attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out) if self.pre_lnorm: # residual connection outputs = [w + attn_out] else: # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: outputs.append(attn_prob) return outputs class RelPartialLearnableDecoderLayer(nn.Module): def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): super().__init__() self.dec_attn = RelPartialLearnableMultiHeadAttn( n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs ) self.pos_ff = PositionwiseFF( d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask) ff_output = self.pos_ff(attn_outputs[0]) outputs = [ff_output] + attn_outputs[1:] return outputs class AdaptiveEmbedding(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): super().__init__() self.n_token = n_token self.d_embed = d_embed self.cutoffs = cutoffs + [n_token] self.div_val = div_val self.d_proj = d_proj self.emb_scale = d_proj ** 0.5 self.cutoff_ends = [0] + self.cutoffs self.emb_layers = nn.ModuleList() self.emb_projs = nn.ParameterList() if div_val == 1: self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0)) if d_proj != d_embed: self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) def forward(self, inp): if self.div_val == 1: embed = self.emb_layers[0](inp) if self.d_proj != self.d_embed: embed = F.linear(embed, self.emb_projs[0]) else: param = next(self.parameters()) inp_flat = inp.view(-1) emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) indices_i = mask_i.nonzero().squeeze() if indices_i.numel() == 0: continue inp_i = inp_flat.index_select(0, indices_i) - l_idx emb_i = self.emb_layers[i](inp_i) emb_i = F.linear(emb_i, self.emb_projs[i]) emb_flat.index_copy_(0, indices_i, emb_i) embed_shape = inp.size() + (self.d_proj,) embed = emb_flat.view(embed_shape) embed.mul_(self.emb_scale) return embed class TransfoXLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = TransfoXLConfig load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" def _init_weight(self, weight): if self.config.init == "uniform": nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) elif self.config.init == "normal": nn.init.normal_(weight, 0.0, self.config.init_std) def _init_bias(self, bias): nn.init.constant_(bias, 0.0) def _init_weights(self, m): """ Initialize the weights. """ classname = m.__class__.__name__ if classname.find("Linear") != -1: if hasattr(m, "weight") and m.weight is not None: self._init_weight(m.weight) if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) elif classname.find("AdaptiveEmbedding") != -1: if hasattr(m, "emb_projs"): for i in range(len(m.emb_projs)): if m.emb_projs[i] is not None: nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) elif classname.find("Embedding") != -1: if hasattr(m, "weight"): self._init_weight(m.weight) elif classname.find("ProjectedAdaptiveLogSoftmax") != -1: if hasattr(m, "cluster_weight") and m.cluster_weight is not None: self._init_weight(m.cluster_weight) if hasattr(m, "cluster_bias") and m.cluster_bias is not None: self._init_bias(m.cluster_bias) if hasattr(m, "out_projs"): for i in range(len(m.out_projs)): if m.out_projs[i] is not None: nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) elif classname.find("LayerNorm") != -1: if hasattr(m, "weight"): nn.init.normal_(m.weight, 1.0, self.config.init_std) if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) else: if hasattr(m, "r_emb"): self._init_weight(m.r_emb) if hasattr(m, "r_w_bias"): self._init_weight(m.r_w_bias) if hasattr(m, "r_r_bias"): self._init_weight(m.r_r_bias) if hasattr(m, "r_bias"): self._init_bias(m.r_bias) TRANSFO_XL_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.TransfoXLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.TransfoXLTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", TRANSFO_XL_START_DOCSTRING, ) class TransfoXLModel(TransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model self.n_head = config.n_head self.d_head = config.d_head self.word_emb = AdaptiveEmbedding( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val ) self.drop = nn.Dropout(config.dropout) self.n_layer = config.n_layer self.tgt_len = config.tgt_len self.mem_len = config.mem_len self.ext_len = config.ext_len self.max_klen = config.tgt_len + config.ext_len + config.mem_len self.attn_type = config.attn_type if not config.untie_r: self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.layers = nn.ModuleList() if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( RelPartialLearnableDecoderLayer( config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, r_w_bias=None if config.untie_r else self.r_w_bias, r_r_bias=None if config.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, ) ) else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints raise NotImplementedError # Removed them to avoid maintaining dead code self.same_length = config.same_length self.clamp_len = config.clamp_len if self.attn_type == 0: # default attention self.pos_emb = PositionalEmbedding(self.d_model) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.init_weights() def get_input_embeddings(self): return self.word_emb def set_input_embeddings(self, new_embeddings): self.word_emb = new_embeddings def backward_compatible(self): self.sample_softmax = -1 def reset_length(self, tgt_len, ext_len, mem_len): self.tgt_len = tgt_len self.mem_len = mem_len self.ext_len = ext_len def _prune_heads(self, heads): logger.info("Head pruning is not implemented for Transformer-XL model") pass def init_mems(self, bsz): if self.mem_len > 0: mems = [] param = next(self.parameters()) for i in range(self.n_layer): empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device) mems.append(empty) return mems else: return None def _update_mems(self, hids, mems, mlen, qlen): # does not deal with None if mems is None: return None # mems is not None assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens # will be used as the extended context. Hence, we only cache # the tokens from `mlen + qlen - self.ext_len - self.mem_len` # to `mlen + qlen - self.ext_len`. with torch.no_grad(): new_mems = [] end_idx = mlen + max(0, qlen - 0 - self.ext_len) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): cat = torch.cat([mems[i], hids[i]], dim=0) new_mems.append(cat[beg_idx:end_idx].detach()) return new_mems @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import TransfoXLTokenizer, TransfoXLModel import torch tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLModel.from_pretrained('transfo-xl-wt103') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] """ # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = input_ids.transpose(0, 1).contiguous() qlen, bsz = input_ids.size() elif inputs_embeds is not None: inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if mems is None: mems = self.init_mems(bsz) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) head_mask = head_mask.to( dtype=next(self.parameters()).dtype ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer if inputs_embeds is not None: word_emb = inputs_embeds else: word_emb = self.word_emb(input_ids) mlen = mems[0].size(0) if mems is not None else 0 klen = mlen + qlen if self.same_length: all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) mask_len = klen - self.mem_len if mask_len > 0: mask_shift_len = qlen - mask_len else: mask_shift_len = qlen dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[ :, :, None ] hids = [] attentions = [] if self.attn_type == 0: # default pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) if self.clamp_len > 0: pos_seq.clamp_(max=self.clamp_len) pos_emb = self.pos_emb(pos_seq) core_out = self.drop(word_emb) pos_emb = self.drop(pos_emb) for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] layer_outputs = layer( core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i] ) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out) new_mems = self._update_mems(hids, mems, mlen, qlen) # We transpose back here to shape [bsz, len, hidden_dim] outputs = [core_out.transpose(0, 1).contiguous(), new_mems] if self.output_hidden_states: # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] hids.append(core_out) hids = list(t.transpose(0, 1).contiguous() for t in hids) outputs.append(hids) if self.output_attentions: # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs.append(attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions) @add_start_docstrings( """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", TRANSFO_XL_START_DOCSTRING, ) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = TransfoXLModel(config) self.sample_softmax = config.sample_softmax assert ( self.sample_softmax <= 0 ), "Sampling from the softmax is not implemented yet. Please look at issue: #3310: https://github.com/huggingface/transformers/issues/3310" self.crit = ProjectedAdaptiveLogSoftmax( config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val ) self.init_weights() def tie_weights(self): """ Run this to be sure output and input (adaptive) softmax weights are tied """ if self.config.tie_weight: for i in range(len(self.crit.out_layers)): self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i]) if self.config.tie_projs: for i, tie_proj in enumerate(self.config.tie_projs): if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: if self.config.torchscript: self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) else: self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] elif tie_proj and self.config.div_val != 1: if self.config.torchscript: self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) else: self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) def init_mems(self, bsz): return self.transformer.init_mems(bsz) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.TransfoXLConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import TransfoXLTokenizer, TransfoXLLMHeadModel import torch tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] """ if input_ids is not None: bsz, tgt_len = input_ids.size(0), input_ids.size(1) elif inputs_embeds is not None: bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1) else: raise ValueError("You have to specify either input_ids or inputs_embeds") transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] outputs = transformer_outputs[1:] softmax_output = self.crit(pred_hid, labels) if labels is None: softmax_output = softmax_output.view(bsz, tgt_len, -1) outputs = [softmax_output] + outputs else: softmax_output = softmax_output.view(bsz, tgt_len - 1) outputs = [softmax_output, None] + outputs return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) def get_output_embeddings(self): """ Double-check if you are using adaptive softmax. """ if self.sample_softmax > 0: return self.out_layer else: return self.crit.out_layers[-1] def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs): inputs = {"input_ids": input_ids} # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_transfo_xl_utilities.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl. """ import torch import torch.nn as nn import torch.nn.functional as F # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) class ProjectedAdaptiveLogSoftmax(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): super().__init__() self.n_token = n_token self.d_embed = d_embed self.d_proj = d_proj self.cutoffs = cutoffs + [n_token] self.cutoff_ends = [0] + self.cutoffs self.div_val = div_val self.shortlist_size = self.cutoffs[0] self.n_clusters = len(self.cutoffs) - 1 self.head_size = self.shortlist_size + self.n_clusters if self.n_clusters > 0: self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) self.out_layers = nn.ModuleList() self.out_projs = nn.ParameterList() if div_val == 1: for i in range(len(self.cutoffs)): if d_proj != d_embed: self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: self.out_projs.append(None) self.out_layers.append(nn.Linear(d_embed, n_token)) else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx)) self.keep_order = keep_order def _compute_logit(self, hidden, weight, bias, proj): if proj is None: logit = F.linear(hidden, weight, bias=bias) else: # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: proj_hid = F.linear(hidden, proj.t().contiguous()) logit = F.linear(proj_hid, weight, bias=bias) # else: # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) # if bias is not None: # logit = logit + bias return logit def forward(self, hidden, labels=None, keep_order=False): """ Params: hidden :: [len*bsz x d_proj] labels :: [len*bsz] Return: if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out :: [(len-1)*bsz] Negative log likelihood We could replace this implementation by the native PyTorch one if their's had an option to set bias on all clusters in the native one. here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 """ if labels is not None: # Shift so that tokens < n predict n hidden = hidden[..., :-1, :].contiguous() labels = labels[..., 1:].contiguous() hidden = hidden.view(-1, hidden.size(-1)) labels = labels.view(-1) if hidden.size(0) != labels.size(0): raise RuntimeError("Input and labels should have the same size " "in the batch dimension.") else: hidden = hidden.view(-1, hidden.size(-1)) if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if labels is not None: out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) else: out = F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] for i in range(len(self.cutoffs)): if self.div_val == 1: l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] weight_i = self.out_layers[0].weight[l_idx:r_idx] bias_i = self.out_layers[0].bias[l_idx:r_idx] else: weight_i = self.out_layers[i].weight bias_i = self.out_layers[i].bias if i == 0: weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) head_logprob = F.log_softmax(head_logit, dim=1) if labels is None: out = hidden.new_empty((head_logit.size(0), self.n_token)) else: out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device) offset = 0 cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] if labels is not None: mask_i = (labels >= l_idx) & (labels < r_idx) indices_i = mask_i.nonzero().squeeze() if indices_i.numel() == 0: continue target_i = labels.index_select(0, indices_i) - l_idx head_logprob_i = head_logprob.index_select(0, indices_i) hidden_i = hidden.index_select(0, indices_i) else: hidden_i = hidden if i == 0: if labels is not None: logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) else: out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster if labels is not None: logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( 1, target_i[:, None] ).squeeze(1) else: logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i out[:, l_idx:r_idx] = logprob_i if labels is not None: if (hasattr(self, "keep_order") and self.keep_order) or keep_order: out.index_copy_(0, indices_i, -logprob_i) else: out[offset : offset + logprob_i.size(0)].copy_(-logprob_i) offset += logprob_i.size(0) return out def log_prob(self, hidden): r""" Computes log probabilities for all :math:`n\_classes` From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py Args: hidden (Tensor): a minibatch of examples Returns: log-probabilities of for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. Shape: - Input: :math:`(N, in\_features)` - Output: :math:`(N, n\_classes)` """ if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) return F.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] for i in range(len(self.cutoffs)): if self.div_val == 1: l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] weight_i = self.out_layers[0].weight[l_idx:r_idx] bias_i = self.out_layers[0].bias[l_idx:r_idx] else: weight_i = self.out_layers[i].weight bias_i = self.out_layers[i].bias if i == 0: weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) out = hidden.new_empty((head_logit.size(0), self.n_token)) head_logprob = F.log_softmax(head_logit, dim=1) cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] if i == 0: out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) logprob_i = head_logprob[:, -i] + tail_logprob_i out[:, start_idx, stop_idx] = logprob_i return out ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_utils.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import inspect import logging import os from typing import Callable, Dict, Iterable, List, Optional, Tuple import torch from torch import Tensor, device, dtype, nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .activations import get_activation from .configuration_utils import PretrainedConfig from .file_utils import ( DUMMY_INPUTS, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url, ) logger = logging.getLogger(__name__) try: from torch.nn import Identity except ImportError: # Older PyTorch compatibility class Identity(nn.Module): r"""A placeholder identity operator that is argument-insensitive. """ def __init__(self, *args, **kwargs): super().__init__() def forward(self, input): return input class ModuleUtilsMixin: """ A few utilities for torch.nn.Modules, to be used as a mixin. """ def num_parameters(self, only_trainable: bool = False) -> int: """ Get number of (optionally, trainable) parameters in the module. """ params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) @staticmethod def _hook_rss_memory_pre_forward(module, *args, **kwargs): try: import psutil except (ImportError): raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") process = psutil.Process(os.getpid()) mem = process.memory_info() module.mem_rss_pre_forward = mem.rss return None @staticmethod def _hook_rss_memory_post_forward(module, *args, **kwargs): try: import psutil except (ImportError): raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") process = psutil.Process(os.getpid()) mem = process.memory_info() module.mem_rss_post_forward = mem.rss mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) return None def add_memory_hooks(self): """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()` """ for module in self.modules(): module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) module.register_forward_hook(self._hook_rss_memory_post_forward) self.reset_memory_hooks_state() def reset_memory_hooks_state(self): for module in self.modules(): module.mem_rss_diff = 0 module.mem_rss_post_forward = 0 module.mem_rss_pre_forward = 0 @property def device(self) -> device: """ Get torch.device from module, assuming that the whole module has one device. """ try: return next(self.parameters()).device except StopIteration: # For nn.DataParallel compatibility in PyTorch 1.5 def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] return tuples gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].device @property def dtype(self) -> dtype: """ Get torch.dtype from module, assuming that the whole module has one dtype. """ try: return next(self.parameters()).dtype except StopIteration: # For nn.DataParallel compatibility in PyTorch 1.5 def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] return tuples gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: """type: torch.Tensor -> torch.Tensor""" if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility if self.dtype == torch.float16: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 elif self.dtype == torch.float32: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: raise ValueError( "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( self.dtype ) ) return encoder_extended_attention_mask def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple, device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. Arguments: attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to input_shape: tuple, shape of input_ids device: torch.Device, usually self.device Returns: torch.Tensor with dtype of attention_mask.dtype """ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] # causal and attention masks must have same type with pytorch version < 1.3 causal_mask = causal_mask.to(attention_mask.dtype) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( input_shape, attention_mask.shape ) ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask: Tensor, num_hidden_layers: int, is_attention_chunked: bool = False) -> Tensor: """ # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head attention_probs has shape bsz x n_heads x N x N Arguments: head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] num_hidden_layers: int Returns: Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] or list with [None] for each layer """ if head_mask is not None: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: head_mask = [None] * num_hidden_layers return head_mask def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility return head_mask class PreTrainedModel(nn.Module, ModuleUtilsMixin): r""" Base class for all models. :class:`~transformers1.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. Class attributes (overridden by derived classes): - ``config_class``: a class derived from :class:`~transformers1.PretrainedConfig` to use as configuration class for this model architecture. - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - ``model``: an instance of the relevant subclass of :class:`~transformers1.PreTrainedModel`, - ``config``: an instance of the relevant subclass of :class:`~transformers1.PretrainedConfig`, - ``path``: a path (string) to the TensorFlow checkpoint. - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. """ config_class = None base_model_prefix = "" @property def dummy_inputs(self): """ Dummy inputs to do a forward pass in the network. Returns: torch.Tensor with dummy inputs """ return {"input_ids": torch.tensor(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super().__init__() if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ ) ) # Save config in model self.config = config @property def base_model(self): return getattr(self, self.base_model_prefix, self) def get_input_embeddings(self): """ Returns the model's input embeddings. Returns: :obj:`nn.Module`: A torch module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: return base_model.get_input_embeddings() else: raise NotImplementedError def set_input_embeddings(self, value: nn.Module): """ Set model's input embeddings Args: value (:obj:`nn.Module`): A module mapping vocabulary to hidden states. """ base_model = getattr(self, self.base_model_prefix, self) if base_model is not self: base_model.set_input_embeddings(value) else: raise NotImplementedError def get_output_embeddings(self): """ Returns the model's output embeddings. Returns: :obj:`nn.Module`: A torch module mapping hidden states to vocabulary. """ return None # Overwrite for models with output embeddings def tie_weights(self): """ Tie the weights between the input embeddings and the output embeddings. If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the weights instead. """ output_embeddings = self.get_output_embeddings() if output_embeddings is not None: self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) def _tie_or_clone_weights(self, output_embeddings, input_embeddings): """ Tie or clone module weights depending of whether we are using TorchScript or not """ if self.config.torchscript: output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) else: output_embeddings.weight = input_embeddings.weight if getattr(output_embeddings, "bias", None) is not None: output_embeddings.bias.data = torch.nn.functional.pad( output_embeddings.bias.data, (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],), "constant", 0, ) if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings def resize_token_embeddings(self, new_num_tokens: Optional[int] = None): """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Arguments: new_num_tokens: (`optional`) int: New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. Return: ``torch.nn.Embeddings`` Pointer to the input tokens Embeddings Module of the model """ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed model_embeds = base_model._resize_token_embeddings(new_num_tokens) if new_num_tokens is None: return model_embeds # Update base model and current model config self.config.vocab_size = new_num_tokens base_model.vocab_size = new_num_tokens # Tie weights again if needed self.tie_weights() return model_embeds def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.get_input_embeddings() new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) self.set_input_embeddings(new_embeddings) return self.get_input_embeddings() def _get_resized_embeddings( self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None ) -> torch.nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end Args: old_embeddings: ``torch.nn.Embedding`` Old embeddings to be resized. new_num_tokens: (`optional`) int New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end If not provided or None: return the provided token Embedding Module. Return: ``torch.nn.Embedding`` Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None """ if new_num_tokens is None: return old_embeddings old_num_tokens, old_embedding_dim = old_embeddings.weight.size() if old_num_tokens == new_num_tokens: return old_embeddings # Build new embeddings new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) new_embeddings.to(old_embeddings.weight.device) # initialize all new embeddings (in particular added tokens) self._init_weights(new_embeddings) # Copy token embeddings from the previous weights num_tokens_to_copy = min(old_num_tokens, new_num_tokens) new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] return new_embeddings def init_weights(self): """ Initialize and prunes weights if needed. """ # Initialize weights self.apply(self._init_weights) # Prune heads if needed if self.config.pruned_heads: self.prune_heads(self.config.pruned_heads) # Tie weights if needed self.tie_weights() def prune_heads(self, heads_to_prune: Dict): """ Prunes heads of the base model. Arguments: heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. """ # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads for layer, heads in heads_to_prune.items(): union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON self.base_model._prune_heads(heads_to_prune) def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers1.PreTrainedModel.from_pretrained`` class method. Arguments: save_directory: directory to which to save. """ assert os.path.isdir( save_directory ), "Saving path should be a directory where the model and configuration can be saved" # Only save the model itself if we are using distributed training model_to_save = self.module if hasattr(self, "module") else self # Attach architecture to the config model_to_save.config.architectures = [model_to_save.__class__.__name__] # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, WEIGHTS_NAME) if getattr(self.config, "xla_device", False): import torch_xla.core.xla_model as xm if xm.is_master_ordinal(): # Save configuration file model_to_save.config.save_pretrained(save_directory) # xm.save takes care of saving only from master xm.save(model_to_save.state_dict(), output_model_file) else: model_to_save.config.save_pretrained(save_directory) torch.save(model_to_save.state_dict(), output_model_file) logger.info("Model weights saved in {}".format(output_model_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with ``model.train()`` The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers1.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) one of: - an instance of a class derived from :class:`~transformers1.PretrainedConfig`, or - a string valid as input to :func:`~transformers1.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~transformers1.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~transformers1.PreTrainedModel.save_pretrained` and :func:`~transformers1.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers1.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: # For example purposes. Not runnable. model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop("config", None) state_dict = kwargs.pop("state_dict", None) cache_dir = kwargs.pop("cache_dir", None) from_tf = kwargs.pop("from_tf", False) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) local_files_only = kwargs.pop("local_files_only", False) use_cdn = kwargs.pop("use_cdn", True) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, **kwargs, ) else: model_kwargs = kwargs # Load model if pretrained_model_name_or_path is not None: if os.path.isdir(pretrained_model_name_or_path): if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): # Load from a TF 1.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): # Load from a TF 2.0 checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: raise EnvironmentError( "Error no file named {} found in directory {} or `from_tf` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path, ) ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): assert ( from_tf ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME), use_cdn=use_cdn, ) try: # Load from URL or cache if already cached resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) if resolved_archive_file is None: raise EnvironmentError except EnvironmentError: msg = ( f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n" ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None # Instantiate model. model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: try: state_dict = torch.load(resolved_archive_file, map_location="cpu") except Exception: raise OSError( "Unable to load weights from pytorch checkpoint file. " "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " ) missing_keys = [] unexpected_keys = [] error_msgs = [] if from_tf: if resolved_archive_file.endswith(".index"): # Load from a TensorFlow 1.X checkpoint - provided by original authors model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' else: # Load from our TensorFlow 2.0 checkpoints try: from transformers import load_tf2_checkpoint_in_pytorch_model model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." ) raise else: # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if "gamma" in key: new_key = key.replace("gamma", "weight") if "beta" in key: new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants # so we need to apply the function recursively. def load(module: nn.Module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs, ) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + ".") # Make sure we are able to load base models as well as derived models (with heads) start_prefix = "" model_to_load = model has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()) if not hasattr(model, cls.base_model_prefix) and has_prefix_module: start_prefix = cls.base_model_prefix + "." if hasattr(model, cls.base_model_prefix) and not has_prefix_module: model_to_load = getattr(model, cls.base_model_prefix) load(model_to_load, prefix=start_prefix) if model.__class__.__name__ != model_to_load.__class__.__name__: base_model_state_dict = model_to_load.state_dict().keys() head_model_state_dict_without_base_prefix = [ key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys() ] missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict) if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys ) ) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys ) ) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading state_dict for {}:\n\t{}".format( model.__class__.__name__, "\n\t".join(error_msgs) ) ) model.tie_weights() # make sure token embedding weights are still tied if needed # Set model in evaluation mode to deactivate DropOut modules by default model.eval() if output_loading_info: loading_info = { "missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs, } return model, loading_info if hasattr(config, "xla_device") and config.xla_device: import torch_xla.core.xla_model as xm model = xm.send_cpu_data_to_device(model, xm.xla_device()) model.to(xm.xla_device()) return model def prepare_inputs_for_generation(self, input_ids, **kwargs): return {"input_ids": input_ids} def prepare_logits_for_generation(self, logits, **kwargs): return logits def _use_cache(self, outputs, use_cache): """During generation, decide whether to pass the `past` variable to the next forward pass.""" if len(outputs) <= 1 or use_cache is False: return False if hasattr(self.config, "mem_len") and self.config.mem_len == 0: return False return True def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ for i in range(batch_size * num_beams): for previous_token in set(prev_output_tokens[i].tolist()): # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability if lprobs[i, previous_token] < 0: lprobs[i, previous_token] *= repetition_penalty else: lprobs[i, previous_token] /= repetition_penalty @torch.no_grad() def generate( self, input_ids: Optional[torch.LongTensor] = None, max_length: Optional[int] = None, min_length: Optional[int] = None, do_sample: Optional[bool] = None, early_stopping: Optional[bool] = None, num_beams: Optional[int] = None, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None, bad_words_ids: Optional[Iterable[int]] = None, bos_token_id: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, length_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, num_return_sequences: Optional[int] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_start_token_id: Optional[int] = None, use_cache: Optional[bool] = None, **model_specific_kwargs ) -> torch.LongTensor: r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. Adapted in part from `Facebook's XLM beam search code`_. .. _`Facebook's XLM beam search code`: https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 Parameters: input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` The sequence used as a prompt for the generation. If `None` the method initializes it as an empty `torch.LongTensor` of shape `(1,)`. max_length: (`optional`) int The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. min_length: (`optional`) int The min length of the sequence to be generated. Between 0 and infinity. Default to 0. do_sample: (`optional`) bool If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. early_stopping: (`optional`) bool if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. num_beams: (`optional`) int Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. temperature: (`optional`) float The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. top_k: (`optional`) int The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. top_p: (`optional`) float The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. repetition_penalty: (`optional`) float The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. pad_token_id: (`optional`) int Padding token. Default to specicic model pad_token_id or None if it does not exist. bos_token_id: (`optional`) int BOS token. Defaults to `bos_token_id` as defined in the models config. eos_token_id: (`optional`) int EOS token. Defaults to `eos_token_id` as defined in the models config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. bad_words_ids: (`optional`) list of lists of int `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. Defaults to `None`. `What are attention masks? <../glossary.html#attention-mask>`__ decoder_start_token_id=None: (`optional`) int If an encoder-decoder model starts decoding with a different token than BOS. Defaults to `None` and is changed to `BOS` later. use_cache: (`optional`) bool If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. model_specific_kwargs: (`optional`) dict Additional model specific kwargs will be forwarded to the `forward` function of the model. Return: output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping use_cache = use_cache if use_cache is not None else self.config.use_cache num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(use_cache, bool), "`use_cache` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." assert temperature > 0, "`temperature` should be strictly positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictly positive." assert ( isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 ), "`no_repeat_ngram_size` should be a positive integer." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = torch.full( (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, ) else: assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): attention_mask = input_ids.ne(pad_token_id).long() elif attention_mask is None: attention_mask = input_ids.new_ones(input_ids.shape) # set pad_token_id to eos_token_id if not set. Important that this is done after # attention_mask is created if pad_token_id is None and eos_token_id is not None: logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size if hasattr(self.config, "vocab_size"): vocab_size = self.config.vocab_size elif ( self.config.is_encoder_decoder and hasattr(self.config, "decoder") and hasattr(self.config.decoder, "vocab_size") ): vocab_size = self.config.decoder.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = input_ids.shape[-1] input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) attention_mask = attention_mask.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, input_ids_len ) input_ids = input_ids.contiguous().view( effective_batch_size * num_beams, input_ids_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = attention_mask.contiguous().view( effective_batch_size * num_beams, input_ids_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = torch.full( (effective_batch_size * num_beams, 1), decoder_start_token_id, dtype=torch.long, device=next(self.parameters()).device, ) cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = ( torch.arange(batch_size) .view(-1, 1) .repeat(1, num_beams * effective_batch_mult) .view(-1) .to(input_ids.device) ) # expand encoder_outputs encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) else: encoder_outputs = None cur_len = input_ids.shape[-1] if num_beams > 1: output = self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, num_return_sequences=num_return_sequences, length_penalty=length_penalty, num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, model_specific_kwargs=model_specific_kwargs, ) else: output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, model_specific_kwargs=model_specific_kwargs, ) return output def _generate_no_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, encoder_outputs, attention_mask, use_cache, model_specific_kwargs, ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ # length of generated sentences / unfinished sentences unfinished_sents = input_ids.new(batch_size).fill_(1) sent_lengths = input_ids.new(batch_size).fill_(max_length) past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) for batch_idx in range(batch_size): next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) for batch_idx in range(batch_size): next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: next_token_logits[:, eos_token_id] = -float("inf") if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature # Top-p/top-k filtering next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) # Sample probs = F.softmax(next_token_logits, dim=-1) next_token = torch.multinomial(probs, num_samples=1).squeeze(1) else: # Greedy decoding next_token = torch.argmax(next_token_logits, dim=-1) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) # unfinished_sents is set to zero if eos in sentence unfinished_sents.mul_((~eos_in_sents).long()) # stop when there is a in each sentence, or if we exceed the maximul length if unfinished_sents.max() == 0: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = torch.cat( [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded if sent_lengths.min().item() != sent_lengths.max().item(): assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) else: decoded = input_ids for hypo_idx, hypo in enumerate(input_ids): decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] return decoded def _generate_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, early_stopping, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, decoder_start_token_id, batch_size, num_return_sequences, length_penalty, num_beams, vocab_size, encoder_outputs, attention_mask, use_cache, model_specific_kwargs, ): """ Generate sequences for each example with beam search. """ # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) for _ in range(batch_size) ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times if do_sample is False: beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs ) outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: self.enforce_repetition_penalty_( next_token_logits, batch_size, num_beams, input_ids, repetition_penalty, ) if temperature != 1.0: next_token_logits = next_token_logits / temperature if self.config.is_encoder_decoder and do_sample is False: # TODO (PVP) still a bit hacky here - there might be a better solution next_token_logits = self.prepare_logits_for_generation( next_token_logits, cur_len=cur_len, max_length=max_length ) scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: scores[:, eos_token_id] = -float("inf") if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams num_batch_hypotheses = batch_size * num_beams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_batch_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len ) for i, banned_tokens in enumerate(banned_batch_tokens): scores[i, banned_tokens] = -float("inf") if bad_words_ids is not None: # calculate a list of banned tokens according to bad words banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) for i, banned_tokens in enumerate(banned_tokens): scores[i, banned_tokens] = -float("inf") assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( scores.shape, (batch_size * num_beams, vocab_size) ) if do_sample: _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # Top-p/top-k filtering _scores = top_k_top_p_filtering( _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together to sample from all beam_idxs _scores = _scores.contiguous().view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) probs = F.softmax(_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) # Compute next scores next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) # sort the sampled vector to make sure that the first num_beams samples are the best next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) else: next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) next_scores = next_scores.view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) # next batch beam content next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): # if we are done with this sentence if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams ), "Batch can only be done if at least {} beams have been generated".format(num_beams) assert ( eos_token_id is not None and pad_token_id is not None ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content next_sent_beam = [] # next tokens for this sentence for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence or last iteration if (eos_token_id is not None) and (token_id.item() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: continue generated_hyps[batch_idx].add( input_ids[effective_beam_id].clone(), beam_token_score.item(), ) else: # add next predicted token if it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: break # Check if were done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( next_scores[batch_idx].max().item(), cur_len=cur_len ) # update next beam content assert len(next_sent_beam) == num_beams, "Beam should always be full" next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_idx + 1) # stop when we are done with each sentence if all(done): break # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) beam_idx = input_ids.new([x[2] for x in next_batch_beam]) # re-order batch and update current length input_ids = input_ids[beam_idx, :] input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) cur_len = cur_len + 1 # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = torch.cat( [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 ) # finalize all open beam hypotheses and end to generated hypotheses for batch_idx in range(batch_size): if done[batch_idx]: continue # test that beam scores match previously calculated scores if not eos and batch_idx not done if eos_token_id is not None and all( (token_id % vocab_size).item() is not eos_token_id for token_id in next_tokens[batch_idx] ): assert torch.all( next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], ) # need to add best num_beams hypotheses to generated hyps for beam_id in range(num_beams): effective_beam_id = batch_idx * num_beams + beam_id final_score = beam_scores[effective_beam_id].item() final_tokens = input_ids[effective_beam_id] generated_hyps[batch_idx].add(final_tokens, final_score) # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch output_batch_size = batch_size if do_sample else batch_size * num_return_sequences output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences # select the best hypotheses sent_lengths = input_ids.new(output_batch_size) best = [] # retrieve best hypotheses for i, hypotheses in enumerate(generated_hyps): sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) for j in range(output_num_return_sequences_per_batch): effective_batch_idx = output_num_return_sequences_per_batch * i + j best_hyp = sorted_hyps.pop()[1] sent_lengths[effective_batch_idx] = len(best_hyp) best.append(best_hyp) # shorter batches are filled with pad_token if sent_lengths.min().item() != sent_lengths.max().item(): assert pad_token_id is not None, "`Pad_token_id` has to be defined" sent_max_len = min(sent_lengths.max().item() + 1, max_length) decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) # fill with hypothesis and eos_token_id if necessary for i, hypo in enumerate(best): decoded[i, : sent_lengths[i]] = hypo if sent_lengths[i] < max_length: decoded[i, sent_lengths[i]] = eos_token_id else: # none of the hypotheses have an eos_token assert (len(hypo) == max_length for hypo in best) decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) return decoded @staticmethod def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: """Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]: banned_tokens = [] def _tokens_match(prev_tokens, tokens): if len(tokens) == 0: # if bad word tokens is just one token always ban it return True if len(tokens) > len(prev_input_ids): # if bad word tokens are longer then prev input_ids they can't be equal return False if prev_tokens[-len(tokens) :] == tokens: # if tokens match return True else: return False for prev_input_ids_slice in prev_input_ids: banned_tokens_slice = [] for banned_token_seq in bad_words_ids: assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( bad_words_ids ) if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: # if tokens do not match continue continue banned_tokens_slice.append(banned_token_seq[-1]) banned_tokens.append(banned_tokens_slice) return banned_tokens def top_k_top_p_filtering( logits: Tensor, top_k: int = 0, top_p: float = 1.0, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1, ) -> Tensor: """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ if top_k > 0: top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) logits[indices_to_remove] = filter_value return logits class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. """ self.max_length = max_length - 1 # ignoring bos_token self.length_penalty = length_penalty self.early_stopping = early_stopping self.num_beams = num_beams self.beams = [] self.worst_score = 1e9 def __len__(self): """ Number of hypotheses in the list. """ return len(self.beams) def add(self, hyp, sum_logprobs): """ Add a new hypothesis to the list. """ score = sum_logprobs / len(hyp) ** self.length_penalty if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_scores[0][1]] self.worst_score = sorted_scores[1][0] else: self.worst_score = min(score, self.worst_score) def is_done(self, best_sum_logprobs, cur_len=None): """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. """ if len(self) < self.num_beams: return False elif self.early_stopping: return True else: if cur_len is None: cur_len = self.max_length cur_score = best_sum_logprobs / cur_len ** self.length_penalty ret = self.worst_score >= cur_score return ret class Conv1D(nn.Module): def __init__(self, nf, nx): """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ super().__init__() self.nf = nf w = torch.empty(nx, nf) nn.init.normal_(w, std=0.02) self.weight = nn.Parameter(w) self.bias = nn.Parameter(torch.zeros(nf)) def forward(self, x): size_out = x.size()[:-1] + (self.nf,) x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) x = x.view(*size_out) return x class PoolerStartLogits(nn.Module): """ Compute SQuAD start_logits from sequence hidden states. """ def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, 1) def forward(self, hidden_states, p_mask=None): """ Args: **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` invalid position mask such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. """ x = self.dense(hidden_states).squeeze(-1) if p_mask is not None: if next(self.parameters()).dtype == torch.float16: x = x * (1 - p_mask) - 65500 * p_mask else: x = x * (1 - p_mask) - 1e30 * p_mask return x class PoolerEndLogits(nn.Module): """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. """ def __init__(self, config): super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dense_1 = nn.Linear(config.hidden_size, 1) def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): """ Args: One of ``start_states``, ``start_positions`` should be not None. If both are set, ``start_positions`` overrides ``start_states``. **start_states**: ``torch.LongTensor`` of shape identical to hidden_states hidden states of the first tokens for the labeled span. **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span: **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. """ assert ( start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) x = self.activation(x) x = self.LayerNorm(x) x = self.dense_1(x).squeeze(-1) if p_mask is not None: if next(self.parameters()).dtype == torch.float16: x = x * (1 - p_mask) - 65500 * p_mask else: x = x * (1 - p_mask) - 1e30 * p_mask return x class PoolerAnswerClass(nn.Module): """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ def __init__(self, config): super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): """ Args: One of ``start_states``, ``start_positions`` should be not None. If both are set, ``start_positions`` overrides ``start_states``. **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. hidden states of the first tokens for the labeled span. **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span. **cls_index**: torch.LongTensor of shape ``(batch_size,)`` position of the CLS token. If None, take the last token. note(Original repo): no dependency on end_feature so that we can obtain one single `cls_logits` for each sample """ hsz = hidden_states.shape[-1] assert ( start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) if cls_index is not None: cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) else: cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) x = self.activation(x) x = self.dense_1(x).squeeze(-1) return x class SQuADHead(nn.Module): r""" A SQuAD head inspired by XLNet. Parameters: config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. Inputs: **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` hidden states of sequence tokens **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the first token for the labeled span. **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` position of the last token for the labeled span. **cls_index**: torch.LongTensor of shape ``(batch_size,)`` position of the CLS token. If None, take the last token. **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` Whether the question has a possible answer in the paragraph or not. **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` Log probabilities for the top config.start_n_top start token possibilities (beam-search). **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` Indices for the top config.start_n_top start token possibilities (beam-search). **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) ``torch.FloatTensor`` of shape ``(batch_size,)`` Log probabilities for the ``is_impossible`` label of the answers. """ def __init__(self, config): super().__init__() self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top self.start_logits = PoolerStartLogits(config) self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) def forward( self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, ): outputs = () start_logits = self.start_logits(hidden_states, p_mask=p_mask) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions, cls_index, is_impossible): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if cls_index is not None and is_impossible is not None: # Predict answerability from the representation of CLS and START cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) loss_fct_cls = nn.BCEWithLogitsLoss() cls_loss = loss_fct_cls(cls_logits, is_impossible) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 outputs = (total_loss,) + outputs else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 ) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # or (if labels are provided) (total_loss,) return outputs class SequenceSummary(nn.Module): r""" Compute a single vector summary of a sequence hidden states according to various possibilities: Args of the config class: summary_type: - 'last' => [default] take the last token hidden state (like XLNet) - 'first' => take the first token hidden state (like Bert) - 'mean' => take the mean of all tokens hidden states - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - 'attn' => Not implemented now, use multi-head attention summary_use_proj: Add a projection after the vector extraction summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ def __init__(self, config: PretrainedConfig): super().__init__() self.summary_type = getattr(config, "summary_type", "last") if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.summary = Identity() if hasattr(config, "summary_use_proj") and config.summary_use_proj: if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) activation_string = getattr(config, "summary_activation", None) self.activation: Callable = (get_activation(activation_string) if activation_string else Identity()) self.first_dropout = Identity() if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: self.first_dropout = nn.Dropout(config.summary_first_dropout) self.last_dropout = Identity() if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward(self, hidden_states, cls_index=None): """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer. cls_index: [optional] position of the classification token if summary_type == 'cls_index', shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ if self.summary_type == "last": output = hidden_states[:, -1] elif self.summary_type == "first": output = hidden_states[:, 0] elif self.summary_type == "mean": output = hidden_states.mean(dim=1) elif self.summary_type == "cls_index": if cls_index is None: cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,) else: cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) elif self.summary_type == "attn": raise NotImplementedError output = self.first_dropout(output) output = self.summary(output) output = self.activation(output) output = self.last_dropout(output) return output def create_position_ids_from_input_ids(input_ids, padding_idx): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param torch.Tensor x: :return torch.Tensor: """ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. mask = input_ids.ne(padding_idx).int() incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask return incremental_indices.long() + padding_idx def prune_linear_layer(layer, index, dim=0): """ Prune a linear layer (a model parameters) to keep only entries in index. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ index = index.to(layer.weight.device) W = layer.weight.index_select(dim, index).clone().detach() if layer.bias is not None: if dim == 1: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True if layer.bias is not None: new_layer.bias.requires_grad = False new_layer.bias.copy_(b.contiguous()) new_layer.bias.requires_grad = True return new_layer def prune_conv1d_layer(layer, index, dim=1): """ Prune a Conv1D layer (a model parameters) to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ index = index.to(layer.weight.device) W = layer.weight.index_select(dim, index).clone().detach() if dim == 0: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) new_layer.weight.requires_grad = False new_layer.weight.copy_(W.contiguous()) new_layer.weight.requires_grad = True new_layer.bias.requires_grad = False new_layer.bias.copy_(b.contiguous()) new_layer.bias.requires_grad = True return new_layer def prune_layer(layer, index, dim=None): """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. Return the pruned layer as a new layer with requires_grad=True. Used to remove heads. """ if isinstance(layer, nn.Linear): return prune_linear_layer(layer, index, dim=0 if dim is None else dim) elif isinstance(layer, Conv1D): return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) else: raise ValueError("Can't prune layer of class {}".format(layer.__class__)) def apply_chunking_to_forward( chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors ) -> torch.Tensor: """ This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as not applying it. Args: chunk_size: int - the chunk size of a chunked tensor. `num_chunks` = `len(input_tensors[0]) / chunk_size` chunk_dim: int - the dimension over which the input_tensors should be chunked forward_fn: fn - the forward fn of the model input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked Returns: a Tensor with the same shape the foward_fn would have given if applied Examples:: # rename the usual forward() fn to forward_chunk() def forward_chunk(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states # implement a chunked forward function def forward(self, hidden_states): return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states) """ assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors) tensor_shape = input_tensors[0].shape assert all( input_tensor.shape == tensor_shape for input_tensor in input_tensors ), "All input tenors have to be of the same shape" # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compability num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) assert num_args_in_forward_chunk_fn == len( input_tensors ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format( num_args_in_forward_chunk_fn, len(input_tensors) ) if chunk_size > 0: assert ( input_tensors[0].shape[chunk_dim] % chunk_size == 0 ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format( input_tensors[0][chunk_dim], chunk_size ) num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size # chunk input tensor into tuples input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) # apply forward fn to every tuple output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) # concatenate output at same dimension return torch.cat(output_chunks, dim=chunk_dim) return forward_fn(*input_tensors) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_xlm.py ================================================ # coding=utf-8 # Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch XLM model. """ import itertools import logging import math import numpy as np import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .activations import gelu from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer logger = logging.getLogger(__name__) XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-mlm-en-2048", "xlm-mlm-ende-1024", "xlm-mlm-enfr-1024", "xlm-mlm-enro-1024", "xlm-mlm-tlm-xnli15-1024", "xlm-mlm-xnli15-1024", "xlm-clm-enfr-1024", "xlm-clm-ende-1024", "xlm-mlm-17-1280", "xlm-mlm-100-1280", # See all XLM models at https://huggingface.co/models?filter=xlm ] def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False def get_masks(slen, lengths, causal, padding_mask=None): """ Generate hidden states mask, and optionally an attention mask. """ alen = torch.arange(slen, dtype=torch.long, device=lengths.device) if padding_mask is not None: mask = padding_mask else: assert lengths.max().item() <= slen mask = alen < lengths[:, None] # attention mask is the same as mask, or triangular inferior attention (causal) bs = lengths.size(0) if causal: attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] else: attn_mask = mask # sanity check assert mask.size() == (bs, slen) assert causal is False or attn_mask.size() == (bs, slen, slen) return mask, attn_mask class MultiHeadAttention(nn.Module): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): super().__init__() self.layer_id = next(MultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim self.n_heads = n_heads self.dropout = config.attention_dropout assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(dim, dim) self.k_lin = nn.Linear(dim, dim) self.v_lin = nn.Linear(dim, dim) self.out_lin = nn.Linear(dim, dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input, mask, kv=None, cache=None, head_mask=None): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) n_heads = self.n_heads dim_per_head = self.dim // n_heads mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) def shape(x): """ projection """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, qlen, klen) weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: outputs = outputs + (weights,) return outputs class TransformerFFN(nn.Module): def __init__(self, in_dim, dim_hidden, out_dim, config): super().__init__() self.dropout = config.dropout self.lin1 = nn.Linear(in_dim, dim_hidden) self.lin2 = nn.Linear(dim_hidden, out_dim) self.act = gelu if config.gelu_activation else F.relu def forward(self, input): x = self.lin1(input) x = self.act(x) x = self.lin2(x) x = F.dropout(x, p=self.dropout, training=self.training) return x class XLMPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLMConfig load_tf_weights = None base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) @property def dummy_inputs(self): inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) if self.config.use_lang_emb and self.config.n_langs > 1: langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, nn.Embedding): if self.config is not None and self.config.embed_init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) if isinstance(module, nn.Linear): if self.config is not None and self.config.init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.init_std) if hasattr(module, "bias") and module.bias is not None: nn.init.constant_(module.bias, 0.0) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) XLM_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLMConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ langs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). See usage examples detailed in the `multilingual documentation `__. token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`): dictionary with ``torch.FloatTensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @add_start_docstrings( "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", XLM_START_DOCSTRING, ) class XLMModel(XLMPreTrainedModel): def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states # encoder / decoder, output layer self.is_encoder = config.is_encoder self.is_decoder = not config.is_encoder if self.is_decoder: raise NotImplementedError("Currently XLM can only be used as an encoder") # self.with_output = with_output self.causal = config.causal # dictionary / languages self.n_langs = config.n_langs self.use_lang_emb = config.use_lang_emb self.n_words = config.n_words self.eos_index = config.eos_index self.pad_index = config.pad_index # self.dico = dico # self.id2lang = config.id2lang # self.lang2id = config.lang2id # assert len(self.dico) == self.n_words # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers self.dropout = config.dropout self.attention_dropout = config.attention_dropout assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) if config.sinusoidal_embeddings: create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) # transformer layers self.attentions = nn.ModuleList() self.layer_norm1 = nn.ModuleList() self.ffns = nn.ModuleList() self.layer_norm2 = nn.ModuleList() # if self.is_decoder: # self.layer_norm15 = nn.ModuleList() # self.encoder_attn = nn.ModuleList() for _ in range(self.n_layers): self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config)) self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() config.pruned_heads = {} for layer, heads in pruned_heads: if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) self.init_weights() def get_input_embeddings(self): return self.embeddings def set_input_embeddings(self, new_embeddings): self.embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.attentions[layer].prune_heads(heads) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMModel import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMModel.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None: bs, slen = input_ids.size() else: bs, slen = inputs_embeds.size()[:-1] if lengths is None: if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs assert lengths.size(0) == bs assert lengths.max().item() <= slen # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # assert (src_enc is None) == (src_len is None) # if src_enc is not None: # assert self.is_decoder # assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) # if self.is_decoder and src_enc is not None: # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] device = input_ids.device if input_ids is not None else inputs_embeds.device # position_ids if position_ids is None: position_ids = torch.arange(slen, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand((bs, slen)) else: assert position_ids.size() == (bs, slen) # (slen, bs) # position_ids = position_ids.transpose(0, 1) # langs if langs is not None: assert langs.size() == (bs, slen) # (slen, bs) # langs = langs.transpose(0, 1) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.n_layers) # do not recompute cached elements if cache is not None and input_ids is not None: _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) if langs is not None and self.use_lang_emb and self.n_langs > 1: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) tensor = F.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers hidden_states = () attentions = () for i in range(self.n_layers): if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # self attention attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i]) attn = attn_outputs[0] if self.output_attentions: attentions = attentions + (attn_outputs[1],) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) # attn = F.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) # FFN tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # Add last hidden state if self.output_hidden_states: hidden_states = hidden_states + (tensor,) # update cache length if cache is not None: cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) outputs = (tensor,) if self.output_hidden_states: outputs = outputs + (hidden_states,) if self.output_attentions: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) class XLMPredLayer(nn.Module): """ Prediction layer (cross_entropy or adaptive_softmax). """ def __init__(self, config): super().__init__() self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index dim = config.emb_dim if config.asm is False: self.proj = nn.Linear(dim, config.n_words, bias=True) else: self.proj = nn.AdaptiveLogSoftmaxWithLoss( in_features=dim, n_classes=config.n_words, cutoffs=config.asm_cutoffs, div_value=config.asm_div_value, head_bias=True, # default is False ) def forward(self, x, y=None): """ Compute the loss, and optionally the scores. """ outputs = () if self.asm is False: scores = self.proj(x) outputs = (scores,) + outputs if y is not None: loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean") outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) outputs = (scores,) + outputs if y is not None: _, loss = self.proj(x, y) outputs = (loss,) + outputs return outputs @add_start_docstrings( """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLM_START_DOCSTRING, ) class XLMWithLMHeadModel(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) self.init_weights() def get_output_embeddings(self): return self.pred_layer.proj def prepare_inputs_for_generation(self, input_ids, **kwargs): mask_token_id = self.config.mask_token_id lang_id = self.config.lang_id effective_batch_size = input_ids.shape[0] mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device) input_ids = torch.cat([input_ids, mask_token], dim=1) if lang_id is not None: langs = torch.full_like(input_ids, lang_id) else: langs = None return {"input_ids": input_ids, "langs": langs} @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMWithLMHeadModel import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_START_DOCSTRING, ) class XLMForSequenceClassification(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLMModel(config) self.sequence_summary = SequenceSummary(config) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForSequenceClassification import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] logits = self.sequence_summary(output) outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs @add_start_docstrings( """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForQuestionAnsweringSimple import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = transformer_outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = ( start_logits, end_logits, ) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) class XLMForQuestionAnswering(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = SQuADHead(config) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels whether a question has an answer or no answer (SQuAD 2.0) cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for position (index) of the classification token to use as input for computing plausibility of the answer. p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be masked. 0.0 mean token is not masked. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top config.start_n_top start token possibilities (beam-search). start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top config.start_n_top start token possibilities (beam-search). end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the ``is_impossible`` label of the answers. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForQuestionAnswering import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds, ) output = transformer_outputs[0] outputs = self.qa_outputs( output, start_positions=start_positions, end_positions=end_positions, cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask, ) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here return outputs @add_start_docstrings( """XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_START_DOCSTRING, ) class XLMForTokenClassification(XLMPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLMModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) def forward( self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLMConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLMTokenizer, XLMForTokenClassification import torch tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280') model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch XLM-RoBERTa model. """ import logging from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings from .modeling_roberta import ( RobertaForMaskedLM, RobertaForMultipleChoice, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, ) logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlm-roberta-base", "xlm-roberta-large", "xlm-roberta-large-finetuned-conll02-dutch", "xlm-roberta-large-finetuned-conll02-spanish", "xlm-roberta-large-finetuned-conll03-english", "xlm-roberta-large-finetuned-conll03-german", # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta ] XLM_ROBERTA_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ @add_start_docstrings( "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaModel(RobertaModel): """ This class overrides :class:`~transformers1.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers1.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): """ This class overrides :class:`~transformers1.RobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): """ This class overrides :class:`~transformers1.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig @add_start_docstrings( """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForTokenClassification(RobertaForTokenClassification): """ This class overrides :class:`~transformers1.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ config_class = XLMRobertaConfig ================================================ FILE: code/bert-base-count5/pretrain/transformers1/modeling_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch XLNet model. """ import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F from .activations import gelu_new, swish from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary logger = logging.getLogger(__name__) XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ "xlnet-base-cased", "xlnet-large-cased", # See all XLNet models at https://huggingface.co/models?filter=xlnet ] def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): """ A map of modules from TF to PyTorch. I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} if hasattr(model, "transformer"): if hasattr(model, "lm_loss"): # We will load also the output bias tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights: # We will load also the sequence summary tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias if ( hasattr(model, "logits_proj") and config.finetuning_task is not None and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights ): tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias # Now load the rest of the transformer model = model.transformer # Embeddings and output tf_to_pt_map.update( { "model/transformer/word_embedding/lookup_table": model.word_embedding.weight, "model/transformer/mask_emb/mask_emb": model.mask_emb, } ) # Transformer blocks for i, b in enumerate(model.layer): layer_str = "model/transformer/layer_%d/" % i tf_to_pt_map.update( { layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, layer_str + "rel_attn/o/kernel": b.rel_attn.o, layer_str + "rel_attn/q/kernel": b.rel_attn.q, layer_str + "rel_attn/k/kernel": b.rel_attn.k, layer_str + "rel_attn/r/kernel": b.rel_attn.r, layer_str + "rel_attn/v/kernel": b.rel_attn.v, layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, } ) # Relative positioning biases if config.untie_r: r_r_list = [] r_w_list = [] r_s_list = [] seg_embed_list = [] for b in model.layer: r_r_list.append(b.rel_attn.r_r_bias) r_w_list.append(b.rel_attn.r_w_bias) r_s_list.append(b.rel_attn.r_s_bias) seg_embed_list.append(b.rel_attn.seg_embed) else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] r_s_list = [model.r_s_bias] seg_embed_list = [model.seg_embed] tf_to_pt_map.update( { "model/transformer/r_r_bias": r_r_list, "model/transformer/r_w_bias": r_w_list, "model/transformer/r_s_bias": r_s_list, "model/transformer/seg_embed": seg_embed_list, } ) return tf_to_pt_map def load_tf_weights_in_xlnet(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ try: import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise # Load weights from TF model init_vars = tf.train.list_variables(tf_path) tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) tf_weights[name] = array # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights) for name, pointer in tf_to_pt_map.items(): logger.info("Importing {}".format(name)) if name not in tf_weights: logger.info("{} not in tf pre-trained weights, skipping".format(name)) continue array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name): logger.info("Transposing") array = np.transpose(array) if isinstance(pointer, list): # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] try: assert p_i.shape == arr_i.shape except AssertionError as e: e.args += (p_i.shape, arr_i.shape) raise logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) p_i.data = torch.from_numpy(arr_i) else: try: assert pointer.shape == array.shape except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) tf_weights.pop(name + "/Adam", None) tf_weights.pop(name + "/Adam_1", None) logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model ACT2FN = {"gelu": gelu_new, "relu": torch.nn.functional.relu, "swish": swish} XLNetLayerNorm = nn.LayerNorm class XLNetRelativeAttention(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.d_model, config.n_head) ) self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model self.scale = 1 / (config.d_head ** 0.5) self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head)) self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.dropout) def prune_heads(self, heads): raise NotImplementedError @staticmethod def rel_shift(x, klen=-1): """perform relative shift to form the relative attention score.""" x_size = x.shape x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3]) x = x[1:, ...] x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3]) # x = x[:, 0:klen, :, :] x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) return x @staticmethod def rel_shift_bnij(x, klen=-1): x_size = x.shape x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2]) x = x[:, :, 1:, :] x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1) # Note: the tensor-slice form was faster in my testing than torch.index_select # However, tracing doesn't like the nature of the slice, and if klen changes # during the run then it'll fail, whereas index_select will be fine. x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long)) # x = x[:, :, :, :klen] return x def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None): """Core relative positional attention operations.""" # content based attention score ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h) # position based attention score bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) # segment based attention score if seg_mat is None: ef = 0 else: ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == torch.float16: attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask) else: attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask) # attention probability attn_prob = F.softmax(attn_score, dim=3) attn_prob = self.dropout(attn_prob) # Mask heads if we want to if head_mask is not None: attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask) # attention output attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, torch.einsum("bnij->ijbn", attn_prob) return attn_vec def post_attention(self, h, attn_vec, residual=True): """Post-attention processing.""" # post-attention projection (back to `d_model`) attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out) if residual: attn_out = attn_out + h output = self.layer_norm(attn_out) return output def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: cat = h # content-based key head k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # h-stream # content-stream query head q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h # post processing output_h = self.post_attention(h, attn_vec_h) # g-stream # query-stream query head q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g # post processing output_g = self.post_attention(g, attn_vec_g) if self.output_attentions: attn_prob = attn_prob_h, attn_prob_g else: # Multi-head attention with relative positional encoding if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: cat = h # content heads q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask ) if self.output_attentions: attn_vec, attn_prob = attn_vec # post processing output_h = self.post_attention(h, attn_vec) output_g = None outputs = (output_h, output_g) if self.output_attentions: outputs = outputs + (attn_prob,) return outputs class XLNetFeedForward(nn.Module): def __init__(self, config): super().__init__() self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) self.layer_1 = nn.Linear(config.d_model, config.d_inner) self.layer_2 = nn.Linear(config.d_inner, config.d_model) self.dropout = nn.Dropout(config.dropout) if isinstance(config.ff_activation, str): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation def forward(self, inp): output = inp output = self.layer_1(output) output = self.activation_function(output) output = self.dropout(output) output = self.layer_2(output) output = self.dropout(output) output = self.layer_norm(output + inp) return output class XLNetLayer(nn.Module): def __init__(self, config): super().__init__() self.rel_attn = XLNetRelativeAttention(config) self.ff = XLNetFeedForward(config) self.dropout = nn.Dropout(config.dropout) def forward( self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None ): outputs = self.rel_attn( output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=mems, target_mapping=target_mapping, head_mask=head_mask, ) output_h, output_g = outputs[:2] if output_g is not None: output_g = self.ff(output_g) output_h = self.ff(output_h) outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there return outputs class XLNetPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = XLNetConfig load_tf_weights = load_tf_weights_in_xlnet base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() elif isinstance(module, XLNetLayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, XLNetRelativeAttention): for param in [ module.q, module.k, module.v, module.o, module.r, module.r_r_bias, module.r_s_bias, module.r_w_bias, module.seg_embed, ]: param.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, XLNetModel): module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) XLNET_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers1.XLNetConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers1.PreTrainedModel.from_pretrained` method to load the model weights. """ XLNET_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers1.BertTokenizer`. See :func:`transformers1.PreTrainedTokenizer.encode` and :func:`transformers1.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems given to this model should not be passed as input ids as they have already been computed. `use_cache` has to be set to `True` to make use of `mems`. perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``: If ``perm_mask[k, i, j] = 0``, i attend to j in batch k; if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. If None, each token attends to all the others (full bidirectional attention). Only used during pretraining (to define factorization order) or for sequential decoding (generation). target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding (generation). token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token. The classifier token should be represented by a ``2``. `What are token type IDs? <../glossary.html#token-type-ids>`_ input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. Kept for compatibility with the original code base. You can only uses one of `input_mask` and `attention_mask` Mask values selected in ``[0, 1]``: ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. use_cache (:obj:`bool`): If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. """ @add_start_docstrings( "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", XLNET_START_DOCSTRING, ) class XLNetModel(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.mem_len = config.mem_len self.reuse_len = config.reuse_len self.d_model = config.d_model self.same_length = config.same_length self.attn_type = config.attn_type self.bi_data = config.bi_data self.clamp_len = config.clamp_len self.n_layer = config.n_layer self.word_embedding = nn.Embedding(config.vocab_size, config.d_model) self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model)) self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.dropout = nn.Dropout(config.dropout) self.init_weights() def get_input_embeddings(self): return self.word_embedding def set_input_embeddings(self, new_embeddings): self.word_embedding = new_embeddings def _prune_heads(self, heads_to_prune): raise NotImplementedError def create_mask(self, qlen, mlen): """ Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. Args: qlen: Sequence length mlen: Mask length :: same_length=False: same_length=True: < qlen > < qlen > ^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1] qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1] [0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1] v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0] """ attn_mask = torch.ones([qlen, qlen]) mask_up = torch.triu(attn_mask, diagonal=1) attn_mask_pad = torch.zeros([qlen, mlen]) ret = torch.cat([attn_mask_pad, mask_up], dim=1) if self.same_length: mask_lo = torch.tril(attn_mask, diagonal=-1) ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1) ret = ret.to(self.device) return ret def cache_mem(self, curr_out, prev_mem): # cache hidden states into memory. if self.reuse_len is not None and self.reuse_len > 0: curr_out = curr_out[: self.reuse_len] if prev_mem is None: new_mem = curr_out[-self.mem_len :] else: new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :] return new_mem.detach() @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq) pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) pos_emb = pos_emb[:, None, :] if bsz is not None: pos_emb = pos_emb.expand(-1, bsz, -1) return pos_emb def relative_positional_encoding(self, qlen, klen, bsz=None): # create relative positional encoding. freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float) inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model)) if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float) bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float) if self.clamp_len > 0: fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) if bsz is not None: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1) else: fwd_pos_seq = torch.arange(beg, end, -1.0) if self.clamp_len > 0: fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) pos_emb = pos_emb.to(self.device) return pos_emb @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`): Sequence of hidden-states at the last layer of the model. `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetModel import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetModel.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_ids = input_ids.transpose(0, 1).contiguous() qlen, bsz = input_ids.shape[0], input_ids.shape[1] elif inputs_embeds is not None: inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen dtype_float = self.dtype device = self.device # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] elif self.attn_type == "bi": attn_mask = None else: raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." if input_mask is None and attention_mask is not None: input_mask = 1.0 - attention_mask if input_mask is not None and perm_mask is not None: data_mask = input_mask[None] + perm_mask elif input_mask is not None and perm_mask is None: data_mask = input_mask[None] elif input_mask is None and perm_mask is not None: data_mask = perm_mask else: data_mask = None if data_mask is not None: # all mems can be attended to if mlen > 0: mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask) data_mask = torch.cat([mems_mask, data_mask], dim=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] else: attn_mask += data_mask[:, :, :, None] if attn_mask is not None: attn_mask = (attn_mask > 0).to(dtype_float) if attn_mask is not None: non_tgt_mask = -torch.eye(qlen).to(attn_mask) if mlen > 0: non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1) non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask) else: non_tgt_mask = None # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k) if target_mapping is not None: word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q) else: output_g = None # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device) cat_ids = torch.cat([mem_pad, token_type_ids], dim=0) else: cat_ids = token_type_ids # `1` indicates not in the same segment [qlen x klen x bsz] seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long() seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float) else: seg_mat = None # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) pos_emb = self.dropout(pos_emb) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) head_mask = head_mask.to( dtype=next(self.parameters()).dtype ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer new_mems = () if mems is None: mems = [None] * len(self.layer) attentions = [] hidden_states = [] for i, layer_module in enumerate(self.layer): if self.mem_len is not None and self.mem_len > 0 and use_cache is True: # cache new mems new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) outputs = layer_module( output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask, r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping, head_mask=head_mask[i], ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) # Add last hidden state if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) output = self.dropout(output_g if output_g is not None else output_h) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) outputs = (output.permute(1, 0, 2).contiguous(),) if self.mem_len is not None and self.mem_len > 0 and use_cache is True: outputs = outputs + (new_mems,) if self.output_hidden_states: if output_g is not None: hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs) else: hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states) outputs = outputs + (hidden_states,) if self.output_attentions: if target_mapping is not None: # when target_mapping is provided, there are 2-tuple of attentions attentions = tuple( tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions ) else: attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs = outputs + (attentions,) return outputs # outputs, (new_mems), (hidden_states), (attentions) @add_start_docstrings( """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) class XLNetLMHeadModel(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.attn_type = config.attn_type self.same_length = config.same_length self.transformer = XLNetModel(config) self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True) self.init_weights() def get_output_embeddings(self): return self.lm_loss def prepare_inputs_for_generation(self, input_ids, past, **kwargs): # Add dummy token at the end (no attention on this one) effective_batch_size = input_ids.shape[0] dummy_token = torch.zeros((effective_batch_size, 1), dtype=torch.long, device=input_ids.device) input_ids = torch.cat([input_ids, dummy_token], dim=1) # Build permutation mask so that previous tokens don't see last token sequence_length = input_ids.shape[1] perm_mask = torch.zeros( (effective_batch_size, sequence_length, sequence_length), dtype=torch.float, device=input_ids.device ) perm_mask[:, :, -1] = 1.0 # We'll only predict the last token target_mapping = torch.zeros( (effective_batch_size, 1, sequence_length), dtype=torch.float, device=input_ids.device ) target_mapping[0, 0, -1] = 1.0 inputs = { "input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping, "use_cache": kwargs["use_cache"], } # if past is defined in model kwargs then use it for faster decoding if past: inputs["mems"] = past return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): Labels for masked language modeling. `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. The labels should correspond to the masked input words that should be predicted and depends on `target_mapping`. Note in order to perform standard auto-regressive language modeling a `` token has to be added to the `input_ids` (see `prepare_inputs_for_generation` fn and examples below) Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored, the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetLMHeadModel import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling. input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0) assert labels.shape[0] == 1, 'only one word will be predicted' perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels) loss, next_token_logits = outputs[:2] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) logits = self.lm_loss(transformer_outputs[0]) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLNET_START_DOCSTRING, ) class XLNetForSequenceClassification(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`) Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForSequenceClassification import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNET_START_DOCSTRING, ) class XLNetForTokenClassification(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`): Classification scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForTokenClassification import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) scores = outputs[0] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, XLNET_START_DOCSTRING, ) class XLNetForMultipleChoice(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.transformer = XLNetModel(config) self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, 1) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) def forward( self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, head_mask=None, inputs_embeds=None, use_cache=True, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForMultipleChoice import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None transformer_outputs = self.transformer( flat_input_ids, token_type_ids=flat_token_type_ids, input_mask=flat_input_mask, attention_mask=flat_attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + transformer_outputs[ 1: ] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForQuestionAnsweringSimple import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) @add_start_docstrings( """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) class XLNetForQuestionAnswering(XLNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top self.transformer = XLNetModel(config) self.start_logits = PoolerStartLogits(config) self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) self.init_weights() @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, use_cache=True, start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels whether a question has an answer or no answer (SQuAD 2.0) cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for position (index) of the classification token to use as input for computing plausibility of the answer. p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be masked. 0.0 mean token is not masked. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers1.XLNetConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top config.start_n_top start token possibilities (beam-search). start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top config.start_n_top start token possibilities (beam-search). end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): Log probabilities for the ``is_impossible`` label of the answers. mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers1 import XLNetTokenizer, XLNetForQuestionAnswering import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] """ transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, token_type_ids=token_type_ids, input_mask=input_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, ) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions, cls_index, is_impossible): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if cls_index is not None and is_impossible is not None: # Predict answerability from the representation of CLS and START cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) loss_fct_cls = nn.BCEWithLogitsLoss() cls_loss = loss_fct_cls(cls_logits, is_impossible) # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 outputs = (total_loss,) + outputs else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 ) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) start_states = torch.einsum( "blh,bl->bh", hidden_states, start_log_probs ) # get the representation of START as weighted sum of hidden states cls_logits = self.answer_class( hidden_states, start_states=start_states, cls_index=cls_index ) # Shape (batch size,): one single `cls_logits` for each sample outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits # or (if labels are provided) (total_loss,) return outputs ================================================ FILE: code/bert-base-count5/pretrain/transformers1/optimization.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch optimization for BERT model.""" import logging import math import torch from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR logger = logging.getLogger(__name__) def get_constant_schedule(optimizer, last_epoch=-1): """ Create a schedule with a constant learning rate. """ return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1.0, num_warmup_steps)) return 1.0 return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): """ Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) return max( 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) ) return LambdaLR(optimizer, lr_lambda, last_epoch) def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): """ Create a schedule with a learning rate that decreases following the values of the cosine function between 0 and `pi * cycles` after a warmup period during which it increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaLR(optimizer, lr_lambda, last_epoch) def get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 ): """ Create a schedule with a learning rate that decreases following the values of the cosine function with several hard restarts, after a warmup period during which it increases linearly between 0 and 1. """ def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) if progress >= 1.0: return 0.0 return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) return LambdaLR(optimizer, lr_lambda, last_epoch) class AdamW(Optimizer): """ Implements Adam algorithm with weight decay fix. Parameters: lr (float): learning rate. Default 1e-3. betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) eps (float): Adams epsilon. Default: 1e-6 weight_decay (float): Weight decay. Default: 0.0 correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): if lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) super().__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") state = self.state[p] # State initialization if len(state) == 0: state["step"] = 0 # Exponential moving average of gradient values state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state["exp_avg_sq"] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] beta1, beta2 = group["betas"] state["step"] += 1 # Decay the first and second moment running average coefficient # In-place operations to update the averages at the same time exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) denom = exp_avg_sq.sqrt().add_(group["eps"]) step_size = group["lr"] if group["correct_bias"]: # No bias correction for Bert bias_correction1 = 1.0 - beta1 ** state["step"] bias_correction2 = 1.0 - beta2 ** state["step"] step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(exp_avg, denom, value=-step_size) # Just adding the square of the weights to the loss function is *not* # the correct way of using L2 regularization/weight decay with Adam, # since that will interact with the m and v parameters in strange ways. # # Instead we want to decay the weights in a manner that doesn't interact # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. # Add weight decay at the end (fixed version) if group["weight_decay"] > 0.0: p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"]) return loss ================================================ FILE: code/bert-base-count5/pretrain/transformers1/optimization_tf.py ================================================ # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Functions and classes related to optimization (weight updates).""" import re import tensorflow as tf class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): """Applies a warmup schedule on a given learning rate decay schedule.""" def __init__( self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None, ): super().__init__() self.initial_learning_rate = initial_learning_rate self.warmup_steps = warmup_steps self.power = power self.decay_schedule_fn = decay_schedule_fn self.name = name def __call__(self, step): with tf.name_scope(self.name or "WarmUp") as name: # Implements polynomial warmup. i.e., if global_step < warmup_steps, the # learning rate will be `global_step/num_warmup_steps * init_lr`. global_step_float = tf.cast(step, tf.float32) warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) warmup_percent_done = global_step_float / warmup_steps_float warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power) return tf.cond( global_step_float < warmup_steps_float, lambda: warmup_learning_rate, lambda: self.decay_schedule_fn(step), name=name, ) def get_config(self): return { "initial_learning_rate": self.initial_learning_rate, "decay_schedule_fn": self.decay_schedule_fn, "warmup_steps": self.warmup_steps, "power": self.power, "name": self.name, } def create_optimizer(init_lr, num_train_steps, num_warmup_steps, end_lr=0.0, optimizer_type="adamw"): """Creates an optimizer with learning rate schedule.""" # Implements linear decay of the learning rate. lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr, ) if num_warmup_steps: lr_schedule = WarmUp( initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps, ) optimizer = AdamWeightDecay( learning_rate=lr_schedule, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], ) return optimizer class AdamWeightDecay(tf.keras.optimizers.Adam): """Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact with the m and v parameters in strange ways. Instead we want ot decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent to adding the square of the weights to the loss with plain (non-momentum) SGD. """ def __init__( self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False, weight_decay_rate=0.0, include_in_weight_decay=None, exclude_from_weight_decay=None, name="AdamWeightDecay", **kwargs ): super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) self.weight_decay_rate = weight_decay_rate self._include_in_weight_decay = include_in_weight_decay self._exclude_from_weight_decay = exclude_from_weight_decay @classmethod def from_config(cls, config): """Creates an optimizer from its config with WarmUp custom object.""" custom_objects = {"WarmUp": WarmUp} return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects) def _prepare_local(self, var_device, var_dtype, apply_state): super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state) apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant( self.weight_decay_rate, name="adam_weight_decay_rate" ) def _decay_weights_op(self, var, learning_rate, apply_state): do_decay = self._do_use_weight_decay(var.name) if do_decay: return var.assign_sub( learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"], use_locking=self._use_locking, ) return tf.no_op() def apply_gradients(self, grads_and_vars, name=None): grads, tvars = list(zip(*grads_and_vars)) return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name,) def _get_lr(self, var_device, var_dtype, apply_state): """Retrieves the learning rate with the given state.""" if apply_state is None: return self._decayed_lr_t[var_dtype], {} apply_state = apply_state or {} coefficients = apply_state.get((var_device, var_dtype)) if coefficients is None: coefficients = self._fallback_apply_state(var_device, var_dtype) apply_state[(var_device, var_dtype)] = coefficients return coefficients["lr_t"], dict(apply_state=apply_state) def _resource_apply_dense(self, grad, var, apply_state=None): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs) def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs) def get_config(self): config = super().get_config() config.update({"weight_decay_rate": self.weight_decay_rate}) return config def _do_use_weight_decay(self, param_name): """Whether to use L2 weight decay for `param_name`.""" if self.weight_decay_rate == 0: return False if self._include_in_weight_decay: for r in self._include_in_weight_decay: if re.search(r, param_name) is not None: return True if self._exclude_from_weight_decay: for r in self._exclude_from_weight_decay: if re.search(r, param_name) is not None: return False return True # Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py class GradientAccumulator(object): """Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``. """ # We use the ON_READ synchronization policy so that no synchronization is # performed on assignment. To get the value, we call .value() which returns the # value on the current replica without synchronization. def __init__(self): """Initializes the accumulator.""" self._gradients = [] self._accum_steps = None @property def step(self): """Number of accumulated steps.""" if self._accum_steps is None: self._accum_steps = tf.Variable( tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) return self._accum_steps.value() @property def gradients(self): """The accumulated gradients on the current replica.""" if not self._gradients: raise ValueError("The accumulator should be called first to initialize the gradients") return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients) def __call__(self, gradients): """Accumulates :obj:`gradients` on the current replica.""" if not self._gradients: _ = self.step # Create the step variable. self._gradients.extend( [ tf.Variable( tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) if gradient is not None else gradient for gradient in gradients ] ) if len(gradients) != len(self._gradients): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) for accum_gradient, gradient in zip(self._gradients, gradients): if accum_gradient is not None and gradient is not None: accum_gradient.assign_add(gradient) self._accum_steps.assign_add(1) def reset(self): """Resets the accumulated gradients on the current replica.""" if not self._gradients: return self._accum_steps.assign(0) for gradient in self._gradients: if gradient is not None: gradient.assign(tf.zeros_like(gradient)) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/pipelines.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import csv import json import logging import os import pickle import sys from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import chain from os.path import abspath, exists from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union import numpy as np from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig from .configuration_utils import PretrainedConfig from .data import SquadExample, squad_convert_examples_to_features from .file_utils import is_tf_available, is_torch_available from .modelcard import ModelCard from .tokenization_auto import AutoTokenizer from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer if is_tf_available(): import tensorflow as tf from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, ) if is_torch_available(): import torch from .modeling_auto import ( AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelWithLMHead, ) if TYPE_CHECKING: from .modeling_utils import PreTrainedModel from .modeling_tf_utils import TFPreTrainedModel logger = logging.getLogger(__name__) def get_framework(model=None): """ Select framework (TensorFlow/PyTorch) to use. If both frameworks are installed and no specific model is provided, defaults to using PyTorch. """ if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the user supplied a model class instance. # Try to guess which framework to use from the model classname framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" elif not is_tf_available() and not is_torch_available(): raise RuntimeError( "At least one of TensorFlow 2.0 or PyTorch should be installed. " "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " "To install PyTorch, read the instructions at https://pytorch.org/." ) else: # framework = 'tf' if is_tf_available() else 'pt' framework = "pt" if is_torch_available() else "tf" return framework class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline """ @abstractmethod def __call__(self, *args, **kwargs): raise NotImplementedError() class DefaultArgumentHandler(ArgumentHandler): """ Default varargs argument parser handling parameters for each Pipeline """ @staticmethod def handle_kwargs(kwargs: Dict) -> List: if len(kwargs) == 1: output = list(kwargs.values()) else: output = list(chain(kwargs.values())) return DefaultArgumentHandler.handle_args(output) @staticmethod def handle_args(args: Sequence[Any]) -> List[str]: # Only one argument, let's do case by case if len(args) == 1: if isinstance(args[0], str): return [args[0]] elif not isinstance(args[0], list): return list(args) else: return args[0] # Multiple arguments (x1, x2, ...) elif len(args) > 1: if all([isinstance(arg, str) for arg in args]): return list(args) # If not instance of list, then it should instance of iterable elif isinstance(args, Iterable): return list(chain.from_iterable(chain(args))) else: raise ValueError( "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args)) ) else: return [] def __call__(self, *args, **kwargs): if len(kwargs) > 0 and len(args) > 0: raise ValueError("Pipeline cannot handle mixed args and kwargs") if len(kwargs) > 0: return DefaultArgumentHandler.handle_kwargs(kwargs) else: return DefaultArgumentHandler.handle_args(args) class PipelineDataFormat: """ Base class for all the pipeline supported data format both for reading and writing. Supported data formats currently includes: - JSON - CSV - stdin/stdout (pipe) PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. """ SUPPORTED_FORMATS = ["json", "csv", "pipe"] def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): self.output_path = output_path self.input_path = input_path self.column = column.split(",") if column is not None else [""] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column] if output_path is not None and not overwrite: if exists(abspath(self.output_path)): raise OSError("{} already exists on disk".format(self.output_path)) if input_path is not None: if not exists(abspath(self.input_path)): raise OSError("{} doesnt exist on disk".format(self.input_path)) @abstractmethod def __iter__(self): raise NotImplementedError() @abstractmethod def save(self, data: dict): """ Save the provided data object with the representation for the current `DataFormat`. :param data: data to store :return: """ raise NotImplementedError() def save_binary(self, data: Union[dict, List[dict]]) -> str: """ Save the provided data object as a pickle-formatted binary data on the disk. :param data: data to store :return: (str) Path where the data has been saved """ path, _ = os.path.splitext(self.output_path) binary_path = os.path.extsep.join((path, "pickle")) with open(binary_path, "wb+") as f_output: pickle.dump(data, f_output) return binary_path @staticmethod def from_str( format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): if format == "json": return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == "csv": return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == "pipe": return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) else: raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format)) class CsvPipelineDataFormat(PipelineDataFormat): def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): super().__init__(output_path, input_path, column, overwrite=overwrite) def __iter__(self): with open(self.input_path, "r") as f: reader = csv.DictReader(f) for row in reader: if self.is_multi_columns: yield {k: row[c] for k, c in self.column} else: yield row[self.column[0]] def save(self, data: List[dict]): with open(self.output_path, "w") as f: if len(data) > 0: writer = csv.DictWriter(f, list(data[0].keys())) writer.writeheader() writer.writerows(data) class JsonPipelineDataFormat(PipelineDataFormat): def __init__( self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, ): super().__init__(output_path, input_path, column, overwrite=overwrite) with open(input_path, "r") as f: self._entries = json.load(f) def __iter__(self): for entry in self._entries: if self.is_multi_columns: yield {k: entry[c] for k, c in self.column} else: yield entry[self.column[0]] def save(self, data: dict): with open(self.output_path, "w") as f: json.dump(data, f) class PipedPipelineDataFormat(PipelineDataFormat): """ Read data from piped input to the python process. For multi columns data, columns should separated by \t If columns are provided, then the output will be a dictionary with {column_x: value_x} """ def __iter__(self): for line in sys.stdin: # Split for multi-columns if "\t" in line: line = line.split("\t") if self.column: # Dictionary to map arguments yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} else: yield tuple(line) # No dictionary to map arguments else: yield line def save(self, data: dict): print(data) def save_binary(self, data: Union[dict, List[dict]]) -> str: if self.output_path is None: raise KeyError( "When using piped input on pipeline outputting large object requires an output file path. " "Please provide such output path through --output argument." ) return super().save_binary(data) class _ScikitCompat(ABC): """ Interface layer for the Scikit and Keras compatibility. """ @abstractmethod def transform(self, X): raise NotImplementedError() @abstractmethod def predict(self, X): raise NotImplementedError() class Pipeline(_ScikitCompat): """ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across different pipelines. Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following operations: Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output Pipeline supports running on CPU or GPU through the device argument. Users can specify device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal. Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large tensor object as nested-lists. In order to avoid dumping such large structure as textual data we provide the binary_output constructor argument. If set to True, the output will be stored in the pickle format. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`): Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text. Return: :obj:`List` or :obj:`Dict`: Pipeline returns list or dictionary depending on: - Whether the user supplied multiple samples - Whether the pipeline exposes multiple fields in the output object """ default_input_names = None def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, task: str = "", args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False, ): if framework is None: framework = get_framework() self.model = model self.tokenizer = tokenizer self.modelcard = modelcard self.framework = framework self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device)) self.binary_output = binary_output self._args_parser = args_parser or DefaultArgumentHandler() # Special handling if self.framework == "pt" and self.device.type == "cuda": self.model = self.model.to(self.device) # Update config with task specific parameters task_specific_params = self.model.config.task_specific_params if task_specific_params is not None and task in task_specific_params: self.model.config.update(task_specific_params.get(task)) def save_pretrained(self, save_directory): """ Save the pipeline's model and tokenizer to the specified save_directory """ if not os.path.isdir(save_directory): logger.error("Provided path ({}) should be a directory".format(save_directory)) return self.model.save_pretrained(save_directory) self.tokenizer.save_pretrained(save_directory) if self.modelcard is not None: self.modelcard.save_pretrained(save_directory) def transform(self, X): """ Scikit / Keras interface to transformers1' pipelines. This method will forward to __call__(). """ return self(X=X) def predict(self, X): """ Scikit / Keras interface to transformers1' pipelines. This method will forward to __call__(). """ return self(X=X) @contextmanager def device_placement(self): """ Context Manager allowing tensor allocation on the user-specified device in framework agnostic way. example: # Explicitly ask for tensor allocation on CUDA device :0 nlp = pipeline(..., device=0) with nlp.device_placement(): # Every framework specific tensor allocation will be done on the request device output = nlp(...) Returns: Context manager """ if self.framework == "tf": with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)): yield else: if self.device.type == "cuda": torch.cuda.set_device(self.device) yield def ensure_tensor_on_device(self, **inputs): """ Ensure PyTorch tensors are on the specified device. :param inputs: :return: """ return {name: tensor.to(self.device) for name, tensor in inputs.items()} def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs): """ Parse arguments and tokenize """ # Parse arguments inputs = self._args_parser(*args, **kwargs) inputs = self.tokenizer.batch_encode_plus( inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, pad_to_max_length=pad_to_max_length, ) return inputs def __call__(self, *args, **kwargs): inputs = self._parse_and_tokenize(*args, **kwargs) return self._forward(inputs) def _forward(self, inputs, return_tensors=False): """ Internal framework specific forward dispatching. Args: inputs: dict holding all the keyworded arguments for required by the model forward method. return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array. Returns: Numpy array """ # Encode for forward with self.device_placement(): if self.framework == "tf": # TODO trace model predictions = self.model(inputs.data, training=False)[0] else: with torch.no_grad(): inputs = self.ensure_tensor_on_device(**inputs) predictions = self.model(**inputs)[0].cpu() if return_tensors: return predictions else: return predictions.numpy() class FeatureExtractionPipeline(Pipeline): """ Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer, which can be used as features in downstream tasks. This feature extraction pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "feature-extraction", for extracting features of a sequence. All models may be used for this pipeline. See a list of all models, including community-contributed models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, task: str = "", ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=True, task=task, ) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() class TextGenerationPipeline(Pipeline): """ Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt. This language generation pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "text-generation", for generating text from a specified prompt. The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available community models on `huggingface.co/models `__. """ # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia # in https://github.com/rusiaaman/XLNet-gen#methodology # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision and denounces one of the men as a horse thief. Although his father initially slaps him for making such an accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. """ ALLOWED_MODELS = [ "XLNetLMHeadModel", "TransfoXLLMHeadModel", "ReformerModelWithLMHead", "GPT2LMHeadModel", "OpenAIGPTLMHeadModel", "CTRLLMHeadModel", "TFXLNetLMHeadModel", "TFTransfoXLLMHeadModel", "TFGPT2LMHeadModel", "TFOpenAIGPTLMHeadModel", "TFCTRLLMHeadModel", ] def __call__( self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): if self.model.__class__.__name__ not in self.ALLOWED_MODELS: raise NotImplementedError( "Generation is currently not supported for {}. Please select a model from {} for generation.".format( self.model.__class__.__name__, self.ALLOWED_MODELS ) ) text_inputs = self._args_parser(*args) results = [] for prompt_text in text_inputs: # Manage correct placement of the tensors with self.device_placement(): if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]: inputs = self._parse_and_tokenize( self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False ) else: inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False) # set input_ids to None to allow empty prompt if inputs["input_ids"].shape[-1] == 0: inputs["input_ids"] = None inputs["attention_mask"] = None if self.framework == "pt" and inputs["input_ids"] is not None: inputs = self.ensure_tensor_on_device(**inputs) input_ids = inputs["input_ids"] # Ensure that batch size = 1 (batch generation not allowed for now) assert ( input_ids is None or input_ids.shape[0] == 1 ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information." output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs) # BS x SL result = [] for generated_sequence in output_sequences: generated_sequence = generated_sequence.numpy().tolist() record = {} if return_tensors: record["generated_token_ids"] = generated_sequence if return_text: # Decode text text = self.tokenizer.decode( generated_sequence, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used if input_ids is None: prompt_length = 0 else: prompt_length = len( self.tokenizer.decode( input_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) ) record["generated_text"] = prompt_text + text[prompt_length:] result.append(record) results += [result] if len(results) == 1: return results[0] return results class TextClassificationPipeline(Pipeline): """ Text classification pipeline using ModelForSequenceClassification head. See the `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information. This text classification pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "sentiment-analysis", for classifying sequences according to positive or negative sentiments. The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) return [{"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores] class FillMaskPipeline(Pipeline): """ Masked language modeling prediction pipeline using ModelWithLMHead head. See the `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information. This mask filling pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "fill-mask", for predicting masked tokens in a sequence. The models that this pipeline can use are models that have been trained with a masked language modeling objective, which includes the bi-directional models in the library. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, topk=5, task: str = "", ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=True, task=task, ) self.topk = topk def __call__(self, *args, **kwargs): inputs = self._parse_and_tokenize(*args, **kwargs) outputs = self._forward(inputs, return_tensors=True) results = [] batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0) for i in range(batch_size): input_ids = inputs["input_ids"][i] result = [] if self.framework == "tf": masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item() logits = outputs[i, masked_index, :] probs = tf.nn.softmax(logits) topk = tf.math.top_k(probs, k=self.topk) values, predictions = topk.values.numpy(), topk.indices.numpy() else: masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item() logits = outputs[i, masked_index, :] probs = logits.softmax(dim=0) values, predictions = probs.topk(self.topk) for v, p in zip(values.tolist(), predictions.tolist()): tokens = input_ids.numpy() tokens[masked_index] = p # Filter padding out: tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)] result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p}) # Append results += [result] if len(results) == 1: return results[0] return results class NerPipeline(Pipeline): """ Named Entity Recognition pipeline using ModelForTokenClassification head. See the `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information. This token recognition pipeline can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous. The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ default_input_names = "sequences" def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False, ignore_labels=["O"], task: str = "", grouped_entities: bool = False, ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=args_parser, device=device, binary_output=binary_output, task=task, ) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) self.ignore_labels = ignore_labels self.grouped_entities = grouped_entities def __call__(self, *args, **kwargs): inputs = self._args_parser(*args, **kwargs) answers = [] for sentence in inputs: # Manage correct placement of the tensors with self.device_placement(): tokens = self.tokenizer.encode_plus( sentence, return_attention_mask=False, return_tensors=self.framework, max_length=self.tokenizer.max_len, ) # Forward if self.framework == "tf": entities = self.model(tokens.data)[0][0].numpy() input_ids = tokens["input_ids"].numpy()[0] else: with torch.no_grad(): tokens = self.ensure_tensor_on_device(**tokens) entities = self.model(**tokens)[0][0].cpu().numpy() input_ids = tokens["input_ids"].cpu().numpy()[0] score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) entities = [] entity_groups = [] entity_group_disagg = [] # Filter to labels not in `self.ignore_labels` filtered_labels_idx = [ (idx, label_idx) for idx, label_idx in enumerate(labels_idx) if self.model.config.id2label[label_idx] not in self.ignore_labels ] for idx, label_idx in filtered_labels_idx: entity = { "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])), "score": score[idx][label_idx].item(), "entity": self.model.config.id2label[label_idx], "index": idx, } last_idx, _ = filtered_labels_idx[-1] if self.grouped_entities: if not entity_group_disagg: entity_group_disagg += [entity] if idx == last_idx: entity_groups += [self.group_entities(entity_group_disagg)] continue # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group if ( entity["entity"] == entity_group_disagg[-1]["entity"] and entity["index"] == entity_group_disagg[-1]["index"] + 1 ): entity_group_disagg += [entity] # Group the entities at the last entity if idx == last_idx: entity_groups += [self.group_entities(entity_group_disagg)] # If the current entity is different from the previous entity, aggregate the disaggregated entity group else: entity_groups += [self.group_entities(entity_group_disagg)] entity_group_disagg = [entity] entities += [entity] # Append if self.grouped_entities: answers += [entity_groups] else: answers += [entities] if len(answers) == 1: return answers[0] return answers def group_entities(self, entities): """ Returns grouped entities """ # Get the last entity in the entity group entity = entities[-1]["entity"] scores = np.mean([entity["score"] for entity in entities]) tokens = [entity["word"] for entity in entities] entity_group = { "entity_group": entity, "score": np.mean(scores), "word": self.tokenizer.convert_tokens_to_string(tokens), } return entity_group TokenClassificationPipeline = NerPipeline class QuestionAnsweringArgumentHandler(ArgumentHandler): """ QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to internal SquadExample / SquadFeature structures. QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied arguments. """ def __call__(self, *args, **kwargs): # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating if args is not None and len(args) > 0: if len(args) == 1: kwargs["X"] = args[0] else: kwargs["X"] = list(args) # Generic compatibility with sklearn and Keras # Batched data if "X" in kwargs or "data" in kwargs: inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] if isinstance(inputs, dict): inputs = [inputs] else: # Copy to avoid overriding arguments inputs = [i for i in inputs] for i, item in enumerate(inputs): if isinstance(item, dict): if any(k not in item for k in ["question", "context"]): raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") inputs[i] = QuestionAnsweringPipeline.create_sample(**item) elif not isinstance(item, SquadExample): raise ValueError( "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format( "X" if "X" in kwargs else "data" ) ) # Tabular input elif "question" in kwargs and "context" in kwargs: if isinstance(kwargs["question"], str): kwargs["question"] = [kwargs["question"]] if isinstance(kwargs["context"], str): kwargs["context"] = [kwargs["context"]] inputs = [ QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"]) ] else: raise ValueError("Unknown arguments {}".format(kwargs)) if not isinstance(inputs, list): inputs = [inputs] return inputs class QuestionAnsweringPipeline(Pipeline): """ Question Answering pipeline using ModelForQuestionAnswering head. See the `question answering usage <../usage.html#question-answering>`__ examples for more information. This question answering can currently be loaded from the :func:`~transformers1.pipeline` method using the following task identifier(s): - "question-answering", for answering questions given a context. The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`): The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. tokenizer (:obj:`~transformers1.PreTrainedTokenizer`): The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from :class:`~transformers1.PreTrainedTokenizer`. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ default_input_names = "question,context" def __init__( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, device: int = -1, task: str = "", **kwargs ): super().__init__( model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, args_parser=QuestionAnsweringArgumentHandler(), device=device, task=task, **kwargs, ) @staticmethod def create_sample( question: Union[str, List[str]], context: Union[str, List[str]] ) -> Union[SquadExample, List[SquadExample]]: """ QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). We currently support extractive question answering. Arguments: question: (str, List[str]) The question to be ask for the associated context context: (str, List[str]) The context in which we will look for the answer. Returns: SquadExample initialized with the corresponding question and context. """ if isinstance(question, list): return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] else: return SquadExample(None, question, context, None, None, None) def __call__(self, *args, **kwargs): """ Args: We support multiple use-cases, the following are exclusive: X: sequence of SquadExample data: sequence of SquadExample question: (str, List[str]), batch of question(s) to map along with context context: (str, List[str]), batch of context(s) associated with the provided question keyword argument Returns: dict: {'answer': str, 'score": float, 'start": int, "end": int} answer: the textual answer in the intial context score: the score the current answer scored for the model start: the character index in the original string corresponding to the beginning of the answer' span end: the character index in the original string corresponding to the ending of the answer' span """ # Set defaults values kwargs.setdefault("topk", 1) kwargs.setdefault("doc_stride", 128) kwargs.setdefault("max_answer_len", 15) kwargs.setdefault("max_seq_len", 384) kwargs.setdefault("max_question_len", 64) kwargs.setdefault("handle_impossible_answer", False) if kwargs["topk"] < 1: raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"])) if kwargs["max_answer_len"] < 1: raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"])) # Convert inputs to features examples = self._args_parser(*args, **kwargs) features_list = [ squad_convert_examples_to_features( [example], self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False, tqdm_enabled=False, ) for example in examples ] all_answers = [] for features, example in zip(features_list, examples): model_input_names = self.tokenizer.model_input_names + ["input_ids"] fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names} # Manage tensor allocation on correct device with self.device_placement(): if self.framework == "tf": fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() else: with torch.no_grad(): # Retrieve the score for the context tokens only (removing question tokens) fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()} start, end = self.model(**fw_args) start, end = start.cpu().numpy(), end.cpu().numpy() min_null_score = 1000000 # large and positive answers = [] for (feature, start_, end_) in zip(features, start, end): # Normalize logits and spans to retrieve the answer start_ = np.exp(start_) / np.sum(np.exp(start_)) end_ = np.exp(end_) / np.sum(np.exp(end_)) # Mask padding and question start_, end_ = ( start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1), ) if kwargs["handle_impossible_answer"]: min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) start_[0] = end_[0] = 0 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text answers += [ { "score": score.item(), "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), "answer": " ".join( example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] ), } for s, e, score in zip(starts, ends, scores) ] if kwargs["handle_impossible_answer"]: answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]] all_answers += answers if len(all_answers) == 1: return all_answers[0] return all_answers def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: """ Take the output of any QuestionAnswering head and will generate probalities for each span to be the actual answer. In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or answer end position being before the starting position. The method supports output the k-best answer through the topk argument. Args: start: numpy array, holding individual start probabilities for each token end: numpy array, holding individual end probabilities for each token topk: int, indicates how many possible answer span(s) to extract from the model's output max_answer_len: int, maximum size of the answer to extract from the model's output """ # Ensure we have batch axis if start.ndim == 1: start = start[None] if end.ndim == 1: end = end[None] # Compute the score of each tuple(start, end) to be the real answer outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) # Remove candidate with end < start and end - start > max_answer_len candidates = np.tril(np.triu(outer), max_answer_len - 1) # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) scores_flat = candidates.flatten() if topk == 1: idx_sort = [np.argmax(scores_flat)] elif len(scores_flat) < topk: idx_sort = np.argsort(-scores_flat) else: idx = np.argpartition(-scores_flat, topk)[0:topk] idx_sort = idx[np.argsort(-scores_flat[idx])] start, end = np.unravel_index(idx_sort, candidates.shape)[1:] return start, end, candidates[0, start, end] def span_to_answer(self, text: str, start: int, end: int): """ When decoding from token probalities, this method maps token indexes to actual word in the initial context. Args: text: str, the actual context to extract the answer from start: int, starting answer token index end: int, ending answer token index Returns: dict: {'answer': str, 'start': int, 'end': int} """ words = [] token_idx = char_start_idx = char_end_idx = chars_idx = 0 for i, word in enumerate(text.split(" ")): token = self.tokenizer.tokenize(word) # Append words if they are in the span if start <= token_idx <= end: if token_idx == start: char_start_idx = chars_idx if token_idx == end: char_end_idx = chars_idx + len(word) words += [word] # Stop if we went over the end of the answer if token_idx > end: break # Append the subtokenization length to the running index token_idx += len(token) chars_idx += len(word) + 1 # Join text with spaces return { "answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx), } class SummarizationPipeline(Pipeline): """ Summarize news articles and other documents Usage:: # use bart in pytorch summarizer = pipeline("summarization") summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) # use t5 in tf summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf") summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default of the pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default of the pipeline will be loaded. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__( self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): r""" Args: *documents: (list of strings) articles to be summarized return_text: (bool, default=True) whether to add a decoded "summary_text" to each result return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result clean_up_tokenization_spaces: (`optional`) bool whether to include extra spaces in the output **generate_kwargs: extra kwargs passed to `self.model.generate`_ Returns: list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize .. _`self.model.generate`: https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate """ assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" assert len(documents) > 0, "Please provide a document to summarize" if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__: raise NotImplementedError( "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`" ) prefix = self.model.config.prefix if self.model.config.prefix is not None else "" if isinstance(documents[0], list): assert ( self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" documents = ([prefix + document for document in documents[0]],) pad_to_max_length = True elif isinstance(documents[0], str): documents = (prefix + documents[0],) pad_to_max_length = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( documents[0] ) ) with self.device_placement(): inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) input_length = inputs["input_ids"].shape[-1] elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() min_length = generate_kwargs.get("min_length", self.model.config.min_length) if input_length < min_length // 2: logger.warning( "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format( min_length, input_length ) ) max_length = generate_kwargs.get("max_length", self.model.config.max_length) if input_length < max_length: logger.warning( "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format( max_length, input_length ) ) summaries = self.model.generate( inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, ) results = [] for summary in summaries: record = {} if return_tensors: record["summary_token_ids"] = summary if return_text: record["summary_text"] = self.tokenizer.decode( summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) results.append(record) return results class TranslationPipeline(Pipeline): """ Translates from one language to another. Usage:: en_fr_translator = pipeline("translation_en_to_fr") en_fr_translator("How old are you?") The models that this pipeline can use are models that have been fine-tuned on a translation task, currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b" See the up-to-date list of available models on `huggingface.co/models `__. Arguments: model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default of the pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a string checkpoint identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default of the pipeline will be loaded. modelcard (:obj:`str` or :class:`~transformers1.ModelCard`, `optional`, defaults to :obj:`None`): Model card attributed to the model for this pipeline. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. args_parser (:class:`~transformers1.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to :obj:`-1`): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model on the associated CUDA device id. """ def __call__( self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs ): r""" Args: *args: (list of strings) texts to be translated return_text: (bool, default=True) whether to add a decoded "translation_text" to each result return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result **generate_kwargs: extra kwargs passed to `self.model.generate`_ Returns: list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate .. _`self.model.generate`: https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate """ assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" prefix = self.model.config.prefix if self.model.config.prefix is not None else "" if isinstance(args[0], list): assert ( self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" args = ([prefix + text for text in args[0]],) pad_to_max_length = True elif isinstance(args[0], str): args = (prefix + args[0],) pad_to_max_length = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( args[0] ) ) with self.device_placement(): inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) input_length = inputs["input_ids"].shape[-1] elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() max_length = generate_kwargs.get("max_length", self.model.config.max_length) if input_length > 0.9 * max_length: logger.warning( "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format( input_length, max_length ) ) translations = self.model.generate( inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, ) results = [] for translation in translations: record = {} if return_tensors: record["translation_token_ids"] = translation if return_text: record["translation_text"] = self.tokenizer.decode( translation, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) results.append(record) return results # Register all the supported tasks here SUPPORTED_TASKS = { "feature-extraction": { "impl": FeatureExtractionPipeline, "tf": TFAutoModel if is_tf_available() else None, "pt": AutoModel if is_torch_available() else None, "default": { "model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}, "config": None, "tokenizer": "distilbert-base-cased", }, }, "sentiment-analysis": { "impl": TextClassificationPipeline, "tf": TFAutoModelForSequenceClassification if is_tf_available() else None, "pt": AutoModelForSequenceClassification if is_torch_available() else None, "default": { "model": { "pt": "distilbert-base-uncased-finetuned-sst-2-english", "tf": "distilbert-base-uncased-finetuned-sst-2-english", }, "config": "distilbert-base-uncased-finetuned-sst-2-english", "tokenizer": "distilbert-base-uncased", }, }, "ner": { "impl": NerPipeline, "tf": TFAutoModelForTokenClassification if is_tf_available() else None, "pt": AutoModelForTokenClassification if is_torch_available() else None, "default": { "model": { "pt": "dbmdz/bert-large-cased-finetuned-conll03-english", "tf": "dbmdz/bert-large-cased-finetuned-conll03-english", }, "config": "dbmdz/bert-large-cased-finetuned-conll03-english", "tokenizer": "bert-large-cased", }, }, "question-answering": { "impl": QuestionAnsweringPipeline, "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None, "pt": AutoModelForQuestionAnswering if is_torch_available() else None, "default": { "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"}, "config": None, "tokenizer": ("distilbert-base-cased", {"use_fast": False}), }, }, "fill-mask": { "impl": FillMaskPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}, "config": None, "tokenizer": ("distilroberta-base", {"use_fast": False}), }, }, "summarization": { "impl": SummarizationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": {"model": {"pt": "facebook/bart-large-cnn", "tf": "t5-small"}, "config": None, "tokenizer": None}, }, "translation_en_to_fr": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "translation_en_to_de": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "translation_en_to_ro": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": { "model": {"pt": "t5-base", "tf": "t5-base"}, "config": None, "tokenizer": ("t5-base", {"use_fast": False}), }, }, "text-generation": { "impl": TextGenerationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, "default": {"model": {"pt": "gpt2", "tf": "gpt2"}, "config": None, "tokenizer": "gpt2"}, }, } def pipeline( task: str, model: Optional = None, config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, framework: Optional[str] = None, **kwargs ) -> Pipeline: """ Utility factory method to build a pipeline. Pipeline are made of: - A Tokenizer instance in charge of mapping raw textual input to token - A Model instance - Some (optional) post processing for enhancing model's output Args: task (:obj:`str`): The task defining which pipeline will be returned. Currently accepted tasks are: - "feature-extraction": will return a :class:`~transformers1.FeatureExtractionPipeline` - "sentiment-analysis": will return a :class:`~transformers1.TextClassificationPipeline` - "ner": will return a :class:`~transformers1.NerPipeline` - "question-answering": will return a :class:`~transformers1.QuestionAnsweringPipeline` - "fill-mask": will return a :class:`~transformers1.FillMaskPipeline` - "summarization": will return a :class:`~transformers1.SummarizationPipeline` - "translation_xx_to_yy": will return a :class:`~transformers1.TranslationPipeline` model (:obj:`str` or :obj:`~transformers1.PreTrainedModel` or :obj:`~transformers1.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a model identifier or an actual pre-trained model inheriting from :class:`~transformers1.PreTrainedModel` for PyTorch and :class:`~transformers1.TFPreTrainedModel` for TensorFlow. If :obj:`None`, the default for this pipeline will be loaded. config (:obj:`str` or :obj:`~transformers1.PretrainedConfig`, `optional`, defaults to :obj:`None`): The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`, a model identifier or an actual pre-trained model configuration inheriting from :class:`~transformers1.PretrainedConfig`. If :obj:`None`, the default for this pipeline will be loaded. tokenizer (:obj:`str` or :obj:`~transformers1.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, a model identifier or an actual pre-trained tokenizer inheriting from :class:`~transformers1.PreTrainedTokenizer`. If :obj:`None`, the default for this pipeline will be loaded. framework (:obj:`str`, `optional`, defaults to :obj:`None`): The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be installed. If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to PyTorch. Returns: :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers1.Pipeline`, according to the task. Examples:: from transformers1 import pipeline, AutoModelForTokenClassification, AutoTokenizer # Sentiment analysis pipeline pipeline('sentiment-analysis') # Question answering pipeline, specifying the checkpoint identifier pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased') # Named entity recognition pipeline, passing in a specific model and tokenizer model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") pipeline('ner', model=model, tokenizer=tokenizer) """ # Retrieve the task if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) framework = framework or get_framework(model) targeted_task = SUPPORTED_TASKS[task] task_class, model_class = targeted_task["impl"], targeted_task[framework] # Use default model/config/tokenizer for the task if no model is provided if model is None: models, config, tokenizer = [targeted_task["default"][k] for k in ["model", "config", "tokenizer"]] model = models[framework] # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = model elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = config else: # Impossible to guest what is the right tokenizer here raise Exception( "Impossible to guess which tokenizer to use. " "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer." ) modelcard = None # Try to infer modelcard from model or config name (if provided as str) if isinstance(model, str): modelcard = model elif isinstance(config, str): modelcard = config # Instantiate tokenizer if needed if isinstance(tokenizer, (str, tuple)): if isinstance(tokenizer, tuple): # For tuple we have (tokenizer name, {kwargs}) tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer) # Instantiate config if needed if isinstance(config, str): config = AutoConfig.from_pretrained(config) # Instantiate modelcard if needed if isinstance(modelcard, str): modelcard = ModelCard.from_pretrained(modelcard) # Instantiate model if needed if isinstance(model, str): # Handle transparent TF/PT model conversion model_kwargs = {} if framework == "pt" and model.endswith(".h5"): model_kwargs["from_tf"] = True logger.warning( "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " "Trying to load the model with PyTorch." ) elif framework == "tf" and model.endswith(".bin"): model_kwargs["from_pt"] = True logger.warning( "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " "Trying to load the model with Tensorflow." ) model = model_class.from_pretrained(model, config=config, **model_kwargs) return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_albert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for ALBERT model.""" import logging import os import unicodedata from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-spiece.model", "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-spiece.model", "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-spiece.model", "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-spiece.model", "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "albert-base-v1": 512, "albert-large-v1": 512, "albert-xlarge-v1": 512, "albert-xxlarge-v1": 512, "albert-base-v2": 512, "albert-large-v2": 512, "albert-xlarge-v2": 512, "albert-xxlarge-v2": 512, } SPIECE_UNDERLINE = "▁" class AlbertTokenizer(PreTrainedTokenizer): """ Constructs an ALBERT tokenizer. Based on `SentencePiece `__ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a .spm extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. bos_token (:obj:`string`, `optional`, defaults to "[CLS]"): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to "[SEP]"): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False, bos_token="[CLS]", eos_token="[SEP]", unk_token="", sep_token="[SEP]", pad_token="", cls_token="[CLS]", mask_token="[MASK]", **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return len(self.sp_model) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() return outputs def _tokenize(self, text, sample=False): """ Tokenize a string. """ text = self.preprocess_text(text) if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] else: cur_pieces[0] = cur_pieces[0][1:] cur_pieces.append(piece[-1]) new_pieces.extend(cur_pieces) else: new_pieces.append(piece) return new_pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An ALBERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return cls + token_ids_0 + sep return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_auto.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Auto Tokenizer class. """ import logging from collections import OrderedDict from .configuration_auto import ( AlbertConfig, AutoConfig, BartConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, ElectraConfig, FlaubertConfig, GPT2Config, LongformerConfig, OpenAIGPTConfig, ReformerConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig, ) from .configuration_marian import MarianConfig from .configuration_utils import PretrainedConfig from .tokenization_albert import AlbertTokenizer from .tokenization_bart import BartTokenizer from .tokenization_bert import BertTokenizer, BertTokenizerFast from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_longformer import LongformerTokenizer from .tokenization_marian import MarianTokenizer from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_reformer import ReformerTokenizer from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_t5 import T5Tokenizer from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlnet import XLNetTokenizer logger = logging.getLogger(__name__) TOKENIZER_MAPPING = OrderedDict( [ (T5Config, (T5Tokenizer, None)), (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), (AlbertConfig, (AlbertTokenizer, None)), (CamembertConfig, (CamembertTokenizer, None)), (XLMRobertaConfig, (XLMRobertaTokenizer, None)), (MarianConfig, (MarianTokenizer, None)), (BartConfig, (BartTokenizer, None)), (LongformerConfig, (LongformerTokenizer, None)), (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), (ReformerConfig, (ReformerTokenizer, None)), (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), (BertConfig, (BertTokenizer, BertTokenizerFast)), (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), (TransfoXLConfig, (TransfoXLTokenizer, TransfoXLTokenizerFast)), (XLNetConfig, (XLNetTokenizer, None)), (FlaubertConfig, (FlaubertTokenizer, None)), (XLMConfig, (XLMTokenizer, None)), (CTRLConfig, (CTRLTokenizer, None)), ] ) class AutoTokenizer: r""":class:`~transformers1.AutoTokenizer` is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` class method. The `from_pretrained()` method takes care of returning the correct tokenizer class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: T5Tokenizer (T5 model) - `distilbert`: DistilBertTokenizer (DistilBert model) - `albert`: AlbertTokenizer (ALBERT model) - `camembert`: CamembertTokenizer (CamemBERT model) - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - `longformer`: LongformerTokenizer (AllenAI Longformer model) - `roberta`: RobertaTokenizer (RoBERTa model) - `bert`: BertTokenizer (Bert model) - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - `xlnet`: XLNetTokenizer (XLNet model) - `xlm`: XLMTokenizer (XLM model) - `ctrl`: CTRLTokenizer (Salesforce CTRL model) - `electra`: ElectraTokenizer (Google ELECTRA model) This class cannot be instantiated using `__init__()` (throw an error). """ def __init__(self): raise EnvironmentError( "AutoTokenizer is designed to be instantiated " "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): r""" Instantiate one of the tokenizer classes of the library from a pre-trained model vocabulary. The tokenizer class to instantiate is selected based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: - `t5`: T5Tokenizer (T5 model) - `distilbert`: DistilBertTokenizer (DistilBert model) - `albert`: AlbertTokenizer (ALBERT model) - `camembert`: CamembertTokenizer (CamemBERT model) - `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - `longformer`: LongformerTokenizer (AllenAI Longformer model) - `roberta`: RobertaTokenizer (RoBERTa model) - `bert-base-japanese`: BertJapaneseTokenizer (Bert model) - `bert`: BertTokenizer (Bert model) - `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - `xlnet`: XLNetTokenizer (XLNet model) - `xlm`: XLMTokenizer (XLM model) - `ctrl`: CTRLTokenizer (Salesforce CTRL model) - `electra`: ElectraTokenizer (Google ELECTRA model) Params: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers1.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the vocabulary files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. use_fast: (`optional`) boolean, default False: Indicate if transformers1 should try to load the fast version of the tokenizer (True) or use the Python one (False). inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers1.PreTrainedTokenizer` for details. Examples:: # Download vocabulary from S3 and cache. tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 (user-uploaded) and cache. tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') """ config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) if "bert-base-japanese" in pretrained_model_name_or_path: return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) use_fast = kwargs.pop("use_fast", False) for config_class, (tokenizer_class_py, tokenizer_class_fast) in TOKENIZER_MAPPING.items(): if isinstance(config, config_class): if tokenizer_class_fast and use_fast: return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) else: return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) raise ValueError( "Unrecognized configuration class {} to build an AutoTokenizer.\n" "Model type should be one of {}.".format( config.__class__, ", ".join(c.__name__ for c in TOKENIZER_MAPPING.keys()) ) ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_bart.py ================================================ # coding=utf-8 # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from .tokenization_roberta import RobertaTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer logger = logging.getLogger(__name__) # vocab and merges same as roberta vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" _all_bart_models = [ "facebook/bart-large", "facebook/bart-large-mnli", "facebook/bart-large-cnn", "facebook/bart-large-xsum", ] class BartTokenizer(RobertaTokenizer): # merges and vocab same as Roberta max_model_input_sizes = {m: 1024 for m in _all_bart_models} pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_bart_models}, "merges_file": {m: merges_url for m in _all_bart_models}, } _all_mbart_models = ["facebook/mbart-large-en-ro"] SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" class MBartTokenizer(XLMRobertaTokenizer): vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} max_model_input_sizes = {m: 1024 for m in _all_mbart_models} pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_bert.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" import collections import logging import os import unicodedata from typing import List, Optional from tokenizers import BertWordPieceTokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "bert-base-uncased": 512, "bert-large-uncased": 512, "bert-base-cased": 512, "bert-large-cased": 512, "bert-base-multilingual-uncased": 512, "bert-base-multilingual-cased": 512, "bert-base-chinese": 512, "bert-base-german-cased": 512, "bert-large-uncased-whole-word-masking": 512, "bert-large-cased-whole-word-masking": 512, "bert-large-uncased-whole-word-masking-finetuned-squad": 512, "bert-large-cased-whole-word-masking-finetuned-squad": 512, "bert-base-cased-finetuned-mrpc": 512, "bert-base-german-dbmdz-cased": 512, "bert-base-german-dbmdz-uncased": 512, "TurkuNLP/bert-base-finnish-cased-v1": 512, "TurkuNLP/bert-base-finnish-uncased-v1": 512, "wietsedv/bert-base-dutch-cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "bert-base-uncased": {"do_lower_case": True}, "bert-large-uncased": {"do_lower_case": True}, "bert-base-cased": {"do_lower_case": False}, "bert-large-cased": {"do_lower_case": False}, "bert-base-multilingual-uncased": {"do_lower_case": True}, "bert-base-multilingual-cased": {"do_lower_case": False}, "bert-base-chinese": {"do_lower_case": False}, "bert-base-german-cased": {"do_lower_case": False}, "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, "bert-large-cased-whole-word-masking": {"do_lower_case": False}, "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, "bert-base-german-dbmdz-cased": {"do_lower_case": False}, "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, } def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): token = token.rstrip("\n") vocab[token] = index return vocab def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() if not text: return [] tokens = text.split() return tokens class BertTokenizer(PreTrainedTokenizer): r""" Constructs a BERT tokenizer. Based on WordPiece. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): File containing the vocabulary. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to do basic tokenization before WordPiece. never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): List of tokens which will never be split during tokenization. Only has an effect when :obj:`do_basic_tokenize=True` unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs ): super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: self.basic_tokenizer = BasicTokenizer( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars ) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) @property def vocab_size(self): return len(self.vocab) def get_vocab(self): return dict(self.vocab, **self.added_tokens_encoder) def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): for sub_token in self.wordpiece_tokenizer.tokenize(token): split_tokens.append(sub_token) else: split_tokens = self.wordpiece_tokenizer.tokenize(text) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, vocab_path): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: vocab_path (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: vocabulary indices are not consecutive." " Please check that the vocabulary is not corrupted!".format(vocab_file) ) index = token_index writer.write(token + "\n") index += 1 return (vocab_file,) class BasicTokenizer(object): """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): """ Constructs a BasicTokenizer. Args: **do_lower_case**: Whether to lower case the input. **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. **tokenize_chinese_chars**: (`optional`) boolean (default True) Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 """ if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = never_split self.tokenize_chinese_chars = tokenize_chinese_chars def tokenize(self, text, never_split=None): """ Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. Args: **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. """ never_split = self.never_split + (never_split if never_split is not None else []) text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: if self.do_lower_case and token not in never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token, never_split)) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens def _run_strip_accents(self, text): """Strips accents from a piece of text.""" text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output) def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" if never_split is not None and text in never_split: return [text] chars = list(text) i = 0 start_new_word = True output = [] while i < len(chars): char = chars[i] if _is_punctuation(char): output.append([char]) start_new_word = True else: if start_new_word: output.append([]) start_new_word = False output[-1].append(char) i += 1 return ["".join(x) for x in output] def _tokenize_chinese_chars(self, text): """Adds whitespace around any CJK character.""" output = [] for char in text: cp = ord(char) if self._is_chinese_char(cp): output.append(" ") output.append(char) output.append(" ") else: output.append(char) return "".join(output) def _is_chinese_char(self, cp): """Checks whether CP is the codepoint of a CJK character.""" # This defines a "chinese character" as anything in the CJK Unicode block: # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) # # Note that the CJK Unicode block is NOT all Japanese and Korean characters, # despite its name. The modern Korean Hangul alphabet is a different block, # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. if ( (cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) # or (cp >= 0x20000 and cp <= 0x2A6DF) # or (cp >= 0x2A700 and cp <= 0x2B73F) # or (cp >= 0x2B740 and cp <= 0x2B81F) # or (cp >= 0x2B820 and cp <= 0x2CEAF) # or (cp >= 0xF900 and cp <= 0xFAFF) or (cp >= 0x2F800 and cp <= 0x2FA1F) # ): # return True return False def _clean_text(self, text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] for char in text: cp = ord(char) if cp == 0 or cp == 0xFFFD or _is_control(char): continue if _is_whitespace(char): output.append(" ") else: output.append(char) return "".join(output) class WordpieceTokenizer(object): """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): self.vocab = vocab self.unk_token = unk_token self.max_input_chars_per_word = max_input_chars_per_word def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. """ output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start > 0: substr = "##" + substr if substr in self.vocab: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically contorl characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `chars` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False class BertTokenizerFast(PreTrainedTokenizerFast): r""" Constructs a "Fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Bert tokenization is Based on WordPiece. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): File containing the vocabulary. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to clean the text before tokenization by removing any control characters and replacing all whitespaces by the classic one. tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to tokenize Chinese characters. This should likely be deactivated for Japanese: see: https://github.com/huggingface/transformers/issues/328 """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=True, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", clean_text=True, tokenize_chinese_chars=True, strip_accents=True, wordpieces_prefix="##", **kwargs ): super().__init__( BertWordPieceTokenizer( vocab_file=vocab_file, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, clean_text=clean_text, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, wordpieces_prefix=wordpieces_prefix, ), unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) self.do_lower_case = do_lower_case def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] if token_ids_1: output += token_ids_1 + [self.sep_token_id] return output def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_bert_japanese.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" import collections import logging import os import unicodedata from typing import Optional from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/vocab.txt", "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt", "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/vocab.txt", "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "cl-tohoku/bert-base-japanese": 512, "cl-tohoku/bert-base-japanese-whole-word-masking": 512, "cl-tohoku/bert-base-japanese-char": 512, "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512, } PRETRAINED_INIT_CONFIGURATION = { "cl-tohoku/bert-base-japanese": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", }, "cl-tohoku/bert-base-japanese-whole-word-masking": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", }, "cl-tohoku/bert-base-japanese-char": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", }, "cl-tohoku/bert-base-japanese-char-whole-word-masking": { "do_lower_case": False, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", }, } class BertJapaneseTokenizer(BertTokenizer): """BERT tokenizer for Japanese text""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, do_lower_case=False, do_word_tokenize=True, do_subword_tokenize=True, word_tokenizer_type="basic", subword_tokenizer_type="wordpiece", never_split=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", mecab_kwargs=None, **kwargs ): """Constructs a MecabBertTokenizer. Args: **vocab_file**: Path to a one-wordpiece-per-line vocabulary file. **do_lower_case**: (`optional`) boolean (default True) Whether to lower case the input. Only has an effect when do_basic_tokenize=True. **do_word_tokenize**: (`optional`) boolean (default True) Whether to do word tokenization. **do_subword_tokenize**: (`optional`) boolean (default True) Whether to do subword tokenization. **word_tokenizer_type**: (`optional`) string (default "basic") Type of word tokenizer. **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None) """ super(BertTokenizer, self).__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, **kwargs, ) # ^^ We call the grandparent's init, not the parent's. if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_word_tokenize = do_word_tokenize if do_word_tokenize: if word_tokenizer_type == "basic": self.word_tokenizer = BasicTokenizer( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False ) elif word_tokenizer_type == "mecab": self.word_tokenizer = MecabTokenizer( do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) ) else: raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) self.do_subword_tokenize = do_subword_tokenize if do_subword_tokenize: if subword_tokenizer_type == "wordpiece": self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) elif subword_tokenizer_type == "character": self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) else: raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) def _tokenize(self, text): if self.do_word_tokenize: tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) else: tokens = [text] if self.do_subword_tokenize: split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] else: split_tokens = tokens return split_tokens class MecabTokenizer: """Runs basic tokenization with MeCab morphological parser.""" def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None): """Constructs a MecabTokenizer. Args: **do_lower_case**: (`optional`) boolean (default True) Whether to lower case the input. **never_split**: (`optional`) list of str Kept for backward compatibility purposes. Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) List of token not to split. **normalize_text**: (`optional`) boolean (default True) Whether to apply unicode normalization to text before tokenization. **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "") """ self.do_lower_case = do_lower_case self.never_split = never_split if never_split is not None else [] self.normalize_text = normalize_text import MeCab self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" if self.normalize_text: text = unicodedata.normalize("NFKC", text) never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] mecab_output = self.mecab.parse(text) cursor = 0 for line in mecab_output.split("\n"): if line == "EOS": break token, _ = line.split("\t") token_start = text.index(token, cursor) token_end = token_start + len(token) if self.do_lower_case and token not in never_split: token = token.lower() tokens.append(token) cursor = token_end return tokens class CharacterTokenizer(object): """Runs Character tokenziation.""" def __init__(self, vocab, unk_token, normalize_text=True): """Constructs a CharacterTokenizer. Args: **vocab**: Vocabulary object. **unk_token**: str A special symbol for out-of-vocabulary token. **normalize_text**: (`optional`) boolean (default True) Whether to apply unicode normalization to text before tokenization. """ self.vocab = vocab self.unk_token = unk_token self.normalize_text = normalize_text def tokenize(self, text): """Tokenizes a piece of text into characters. For example: input = "apple" output = ["a", "p", "p", "l", "e"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of characters. """ if self.normalize_text: text = unicodedata.normalize("NFKC", text) output_tokens = [] for i, char in enumerate(text): if char not in self.vocab: output_tokens.append(self.unk_token) continue output_tokens.append(char) return output_tokens ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_camembert.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for Camembert model.""" import logging import os from shutil import copyfile from typing import List, Optional import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "camembert-base": None, } SHARED_MODEL_IDENTIFIERS = [ # Load with # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")` "Musixmatch/umberto-commoncrawl-cased-v1", "Musixmatch/umberto-wikipedia-uncased-v1", ] class CamembertTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", additional_special_tokens=["NOTUSED", "NOTUSED"], **kwargs ): super().__init__( max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # sentencepiece vocabulary (this is the case for and self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3} self.fairseq_offset = len(self.fairseq_tokens_to_ids) self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A CamemBERT sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): return len(self.fairseq_tokens_to_ids) + len(self.sp_model) def _tokenize(self, text): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] elif self.sp_model.PieceToId(token) == 0: # Convert sentence piece unk token to fairseq unk token index return self.unk_token_id return self.fairseq_offset + self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_ctrl.py ================================================ # coding=utf-8 # Copyright 2018 Salesforce and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Salesforce CTRL.""" import json import logging import os import regex as re from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"}, "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "ctrl": 256, } CONTROL_CODES = { "Pregnancy": 168629, "Christianity": 7675, "Explain": 106423, "Fitness": 63440, "Saving": 63163, "Ask": 27171, "Ass": 95985, "Joke": 163509, "Questions": 45622, "Thoughts": 49605, "Retail": 52342, "Feminism": 164338, "Writing": 11992, "Atheism": 192263, "Netflix": 48616, "Computing": 39639, "Opinion": 43213, "Alone": 44967, "Funny": 58917, "Gaming": 40358, "Human": 4088, "India": 1331, "Joker": 77138, "Diet": 36206, "Legal": 11859, "Norman": 4939, "Tip": 72689, "Weight": 52343, "Movies": 46273, "Running": 23425, "Science": 2090, "Horror": 37793, "Confession": 60572, "Finance": 12250, "Politics": 16360, "Scary": 191985, "Support": 12654, "Technologies": 32516, "Teenage": 66160, "Event": 32769, "Learned": 67460, "Notion": 182770, "Wikipedia": 37583, "Books": 6665, "Extract": 76050, "Confessions": 102701, "Conspiracy": 75932, "Links": 63674, "Narcissus": 150425, "Relationship": 54766, "Relationships": 134796, "Reviews": 41671, "News": 4256, "Translation": 26820, "multilingual": 128406, } def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char pairs = set(pairs) return pairs class CTRLTokenizer(PreTrainedTokenizer): """ Constructs a CTRL tokenizer. Peculiarities: - Byte-Pair-Encoding This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES control_codes = CONTROL_CODES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super().__init__(unk_token=unk_token, **kwargs) with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) word = tuple(list(word[:-1]) + [word[-1] + ""]) pairs = get_pairs(word) if not pairs: return token while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = "@@ ".join(word) word = word[:-4] self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ split_tokens = [] words = re.findall(r"\S+\n?", text) for token in words: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) # return ''.join(tokens_generated_so_far) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_distilbert.py ================================================ # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for DistilBERT.""" import logging from .tokenization_bert import BertTokenizer, BertTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "distilbert-base-uncased": 512, "distilbert-base-uncased-distilled-squad": 512, "distilbert-base-cased": 512, "distilbert-base-cased-distilled-squad": 512, "distilbert-base-german-cased": 512, "distilbert-base-multilingual-cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "distilbert-base-uncased": {"do_lower_case": True}, "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, "distilbert-base-cased": {"do_lower_case": False}, "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, "distilbert-base-german-cased": {"do_lower_case": False}, "distilbert-base-multilingual-cased": {"do_lower_case": False}, } class DistilBertTokenizer(BertTokenizer): r""" Constructs a DistilBertTokenizer. :class:`~transformers1.DistilBertTokenizer is identical to :class:`~transformers1.BertTokenizer` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizer` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] class DistilBertTokenizerFast(BertTokenizerFast): r""" Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library). :class:`~transformers1.DistilBertTokenizerFast` is identical to :class:`~transformers1.BertTokenizerFast` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizerFast` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_electra.py ================================================ # coding=utf-8 # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .tokenization_bert import BertTokenizer, BertTokenizerFast VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt", "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt", "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt", "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt", "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt", "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/electra-small-generator": 512, "google/electra-base-generator": 512, "google/electra-large-generator": 512, "google/electra-small-discriminator": 512, "google/electra-base-discriminator": 512, "google/electra-large-discriminator": 512, } PRETRAINED_INIT_CONFIGURATION = { "google/electra-small-generator": {"do_lower_case": True}, "google/electra-base-generator": {"do_lower_case": True}, "google/electra-large-generator": {"do_lower_case": True}, "google/electra-small-discriminator": {"do_lower_case": True}, "google/electra-base-discriminator": {"do_lower_case": True}, "google/electra-large-discriminator": {"do_lower_case": True}, } class ElectraTokenizer(BertTokenizer): r""" Constructs an Electra tokenizer. :class:`~transformers1.ElectraTokenizer` is identical to :class:`~transformers1.BertTokenizer` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizer` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION class ElectraTokenizerFast(BertTokenizerFast): r""" Constructs a "Fast" Electra Fast tokenizer (backed by HuggingFace's `tokenizers` library). :class:`~transformers1.ElectraTokenizerFast` is identical to :class:`~transformers1.BertTokenizerFast` and runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers1.BertTokenizerFast` for usage examples and documentation concerning parameters. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_flaubert.py ================================================ # coding=utf-8 # Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Flaubert, based on XLM.""" import logging import unicodedata import six from .tokenization_xlm import XLMTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json", }, "merges_file": { "flaubert/flaubert_small_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt", "flaubert/flaubert_base_uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt", "flaubert/flaubert_base_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt", "flaubert/flaubert_large_cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "flaubert/flaubert_small_cased": 512, "flaubert/flaubert_base_uncased": 512, "flaubert/flaubert_base_cased": 512, "flaubert/flaubert_large_cased": 512, } PRETRAINED_INIT_CONFIGURATION = { "flaubert/flaubert_small_cased": {"do_lowercase": False}, "flaubert/flaubert_base_uncased": {"do_lowercase": True}, "flaubert/flaubert_base_cased": {"do_lowercase": False}, "flaubert/flaubert_large_cased": {"do_lowercase": False}, } def convert_to_unicode(text): """ Converts `text` to Unicode (if it's not already), assuming UTF-8 input. """ # six_ensure_text is copied from https://github.com/benjaminp/six def six_ensure_text(s, encoding="utf-8", errors="strict"): if isinstance(s, six.binary_type): return s.decode(encoding, errors) elif isinstance(s, six.text_type): return s else: raise TypeError("not expecting type '%s'" % type(s)) return six_ensure_text(text, encoding="utf-8", errors="ignore") class FlaubertTokenizer(XLMTokenizer): """ BPE tokenizer for Flaubert - Moses preprocessing & tokenization - Normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ (ex: "__classify__") to a vocabulary - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies) This tokenizer inherits from :class:`~transformers1.XLMTokenizer`. Please check the superclass for usage examples and documentation regarding arguments. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, do_lowercase=False, **kwargs): super().__init__(**kwargs) self.do_lowercase = do_lowercase self.do_lowercase_and_remove_accent = False def preprocess_text(self, text): text = text.replace("``", '"').replace("''", '"') text = convert_to_unicode(text) text = unicodedata.normalize("NFC", text) if self.do_lowercase: text = text.lower() return text def _tokenize(self, text, bypass_tokenizer=False): """ Tokenize a string given language code using Moses. Details of tokenization: - [sacremoses](https://github.com/alvations/sacremoses): port of Moses - Install with `pip install sacremoses` Args: - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. Returns: List of tokens. """ lang = "fr" if lang and self.lang2id and lang not in self.lang2id: logger.error( "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." ) if bypass_tokenizer: text = text.split() else: text = self.preprocess_text(text) text = self.moses_pipeline(text, lang=lang) text = self.moses_tokenize(text, lang=lang) split_tokens = [] for token in text: if token: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_gpt2.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" import json import logging import os from functools import lru_cache import regex as re from tokenizers import ByteLevelBPETokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", }, "merges_file": { "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "gpt2": 1024, "gpt2-medium": 1024, "gpt2-large": 1024, "gpt2-xl": 1024, "distilgpt2": 1024, } @lru_cache() def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. """ bs = ( list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) ) cs = bs[:] n = 0 for b in range(2 ** 8): if b not in bs: bs.append(b) cs.append(2 ** 8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, errors="replace", unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs ): super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: bpe_merges = merges_handle.read().split("\n")[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) pairs = get_pairs(word) if not pairs: return token while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ bpe_tokens = [] for token in re.findall(self.pat, text): token = "".join( self.byte_encoder[b] for b in token.encode("utf-8") ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file def prepare_for_tokenization(self, text, **kwargs): if "add_prefix_space" in kwargs and kwargs["add_prefix_space"]: return " " + text return text class GPT2TokenizerFast(PreTrainedTokenizerFast): """ Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library). Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. add_prefix_space (:obj:`bool`, `optional`, defaults to `False`): Whether to add a leading space to the first word. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceeding space) trim_offsets (:obj:`bool`, `optional`, defaults to `True`): Whether the post processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", add_prefix_space=False, trim_offsets=True, **kwargs ): super().__init__( ByteLevelBPETokenizer( vocab_file=vocab_file, merges_file=merges_file, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, ), bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs, ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_longformer.py ================================================ # coding=utf-8 # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast logger = logging.getLogger(__name__) # vocab and merges same as roberta vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" _all_longformer_models = [ "allenai/longformer-base-4096", "allenai/longformer-large-4096", "allenai/longformer-large-4096-finetuned-triviaqa", "allenai/longformer-base-4096-extra.pos.embd.only", "allenai/longformer-large-4096-extra.pos.embd.only", ] PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "allenai/longformer-base-4096": 4096, "allenai/longformer-large-4096": 4096, "allenai/longformer-large-4096-finetuned-triviaqa": 4096, "allenai/longformer-base-4096-extra.pos.embd.only": 4096, "allenai/longformer-large-4096-extra.pos.embd.only": 4096, } class LongformerTokenizer(RobertaTokenizer): # merges and vocab same as Roberta max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models}, } class LongformerTokenizerFast(RobertaTokenizerFast): # merges and vocab same as Roberta max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_vocab_files_map = { "vocab_file": {m: vocab_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models}, } ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_marian.py ================================================ import json import re import warnings from pathlib import Path from shutil import copyfile from typing import Dict, List, Optional, Tuple, Union import sentencepiece from .file_utils import S3_BUCKET_PREFIX from .tokenization_utils import BatchEncoding, PreTrainedTokenizer vocab_files_names = { "source_spm": "source.spm", "target_spm": "target.spm", "vocab": "vocab.json", "tokenizer_config_file": "tokenizer_config.json", } MODEL_NAMES = ("opus-mt-en-de",) # TODO(SS): delete this, the only required constant is vocab_files_names PRETRAINED_VOCAB_FILES_MAP = { k: {m: f"{S3_BUCKET_PREFIX}/Helsinki-NLP/{m}/{fname}" for m in MODEL_NAMES} for k, fname in vocab_files_names.items() } # Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json class MarianTokenizer(PreTrainedTokenizer): """Sentencepiece tokenizer for marian. Source and target languages have different SPM models. The logic is use the relevant source_spm or target_spm to encode txt as pieces, then look up each piece in a vocab dictionary. Examples:: from transformers1 import MarianTokenizer tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts) # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]. # model(**batch) should work """ vocab_files_names = vocab_files_names pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = {m: 512 for m in MODEL_NAMES} model_input_names = ["attention_mask"] # actually attention_mask, decoder_attention_mask language_code_re = re.compile(">>.+<<") # type: re.Pattern def __init__( self, vocab=None, source_spm=None, target_spm=None, source_lang=None, target_lang=None, unk_token="", eos_token="", pad_token="", max_len=512, **kwargs, ): super().__init__( # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id max_len=max_len, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, **kwargs, ) self.encoder = load_json(vocab) if self.unk_token not in self.encoder: raise KeyError(" token must be in vocab") assert self.pad_token in self.encoder self.decoder = {v: k for k, v in self.encoder.items()} self.source_lang = source_lang self.target_lang = target_lang self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")] self.spm_files = [source_spm, target_spm] # load SentencePiece model for pre-processing self.spm_source = load_spm(source_spm) self.spm_target = load_spm(target_spm) self.current_spm = self.spm_source # Multilingual target side: default to using first supported language code. self._setup_normalizer() def _setup_normalizer(self): try: from mosestokenizer import MosesPunctuationNormalizer self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang) except ImportError: warnings.warn("Recommended: pip install mosestokenizer") self.punc_normalizer = lambda x: x def normalize(self, x: str) -> str: """Cover moses empty string edge case. They return empty list for '' input!""" return self.punc_normalizer(x) if x else "" def _convert_token_to_id(self, token): return self.encoder.get(token, self.encoder[self.unk_token]) def remove_language_code(self, text: str): """Remove language codes like <> before sentencepiece""" match = self.language_code_re.match(text) code: list = [match.group(0)] if match else [] return code, self.language_code_re.sub("", text) def _tokenize(self, text: str) -> List[str]: code, text = self.remove_language_code(text) pieces = self.current_spm.EncodeAsPieces(text) return code + pieces def _convert_id_to_token(self, index: int) -> str: """Converts an index (integer) in a token (str) using the encoder.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens: List[str]) -> str: """Uses target language sentencepiece model""" return self.spm_target.DecodePieces(tokens) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: """Build model inputs from a sequence by appending eos_token_id.""" if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] # We don't expect to process pairs, but leave the pair logic for API consistency return token_ids_0 + token_ids_1 + [self.eos_token_id] def prepare_translation_batch( self, src_texts: List[str], tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, pad_to_max_length: bool = True, return_tensors: str = "pt", ) -> BatchEncoding: """Prepare model inputs for translation. For best performance, translate one sentence at a time. Arguments: src_texts: list of src language texts tgt_texts: list of tgt language texts max_length: (None) defer to config (1024 for mbart-large-en-ro) pad_to_max_length: (bool) return_tensors: (str) default "pt" returns pytorch tensors, pass None to return lists. Returns: BatchEncoding: with keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask] all shaped bs, seq_len. (BatchEncoding is a dict of string -> tensor or lists). If no tgt_text is specified, the only keys will be input_ids and attention_mask. """ if "" in src_texts: raise ValueError(f"found empty string in src_texts: {src_texts}") self.current_spm = self.spm_source src_texts = [self.normalize(t) for t in src_texts] # this does not appear to do much model_inputs: BatchEncoding = self.batch_encode_plus( src_texts, add_special_tokens=True, return_tensors=return_tensors, max_length=max_length, pad_to_max_length=pad_to_max_length, ) if tgt_texts is None: return model_inputs self.current_spm = self.spm_target decoder_inputs: BatchEncoding = self.batch_encode_plus( tgt_texts, add_special_tokens=True, return_tensors=return_tensors, max_length=max_length, pad_to_max_length=pad_to_max_length, ) for k, v in decoder_inputs.items(): model_inputs[f"decoder_{k}"] = v self.current_spm = self.spm_source return model_inputs @property def vocab_size(self) -> int: return len(self.encoder) def save_vocabulary(self, save_directory: str) -> Tuple[str]: """save vocab file to json and copy spm files from their original path.""" save_dir = Path(save_directory) assert save_dir.is_dir(), f"{save_directory} should be a directory" save_json(self.encoder, save_dir / self.vocab_files_names["vocab"]) for f in self.spm_files: dest_path = save_dir / Path(f).name if not dest_path.exists(): copyfile(f, save_dir / Path(f).name) return tuple(save_dir / f for f in self.vocab_files_names) def get_vocab(self) -> Dict: vocab = self.encoder.copy() vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self) -> Dict: state = self.__dict__.copy() state.update({k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer"]}) return state def __setstate__(self, d: Dict) -> None: self.__dict__ = d self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) self.current_spm = self.spm_source self._setup_normalizer() def num_special_tokens_to_add(self, **unused): """Just EOS""" return 1 def _special_token_mask(self, seq): all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp all_special_ids.remove(self.unk_token_id) # is only sometimes special return [1 if x in all_special_ids else 0 for x in seq] def get_special_tokens_mask( self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False ) -> List[int]: """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" if already_has_special_tokens: return self._special_token_mask(token_ids_0) elif token_ids_1 is None: return self._special_token_mask(token_ids_0) + [1] else: return self._special_token_mask(token_ids_0 + token_ids_1) + [1] def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: spm = sentencepiece.SentencePieceProcessor() spm.Load(path) return spm def save_json(data, path: str) -> None: with open(path, "w") as f: json.dump(data, f, indent=2) def load_json(path: str) -> Union[Dict, List]: with open(path, "r") as f: return json.load(f) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_openai.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" import json import logging import os import re from tokenizers import CharBPETokenizer from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"}, "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "openai-gpt": 512, } def get_pairs(word): """ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length strings) """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def text_standardize(text): """ fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization """ text = text.replace("—", "-") text = text.replace("–", "-") text = text.replace("―", "-") text = text.replace("…", "...") text = text.replace("´", "'") text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text) text = re.sub(r"\s*\n\s*", " \n ", text) text = re.sub(r"[^\S\n]+", " ", text) return text.strip() class OpenAIGPTTokenizer(PreTrainedTokenizer): """ BPE tokenizer. Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super().__init__(unk_token=unk_token, **kwargs) try: import ftfy from spacy.lang.en import English _nlp = English() self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) self.fix_text = None with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token + "" while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) if word == "\n ": word = "\n" self.cache[token] = word return word def _tokenize(self, text): """ Tokenize a string. """ split_tokens = [] if self.fix_text is None: # Using BERT's BasicTokenizer text = self.nlp.tokenize(text) for token in text: split_tokens.extend([t for t in self.bpe(token).split(" ")]) else: # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) text = self.nlp(text_standardize(self.fix_text(text))) for token in text: split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an id in a token (BPE) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = "".join(tokens).replace("", " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): """ Construct a "Fast" BPE tokenizer for OpenAI GPT (backed by HuggingFace's `tokenizers` library). Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): kwargs.setdefault("unk_token", unk_token) super().__init__( CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True), **kwargs, ) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_reformer.py ================================================ # coding=utf-8 # Copyright 2020 The Trax Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization class for model Reformer.""" import logging import os from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model" } } #################################################### # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, } class ReformerTokenizer(PreTrainedTokenizer): """ Constructs an Reformer tokenizer. Based on `SentencePiece `__ . This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a `.spm` extension) that contains the vocabulary necessary to instantiate a tokenizer. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`): Additional special tokens used by the tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, eos_token="", unk_token="", pad_token="", additional_special_tokens=[], **kwargs ): super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, additional_special_tokens=additional_special_tokens, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use ReformerTokenizer:" "https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return self.sp_model.get_piece_size() def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use ReformerTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def _tokenize(self, text, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) return pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = self.sp_model.decode_pieces(tokens) return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_roberta.py ================================================ # coding=utf-8 # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for RoBERTa.""" import logging from typing import List, Optional from tokenizers import AddedToken from tokenizers.processors import RobertaProcessing from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", }, "merges_file": { "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "roberta-base": 512, "roberta-large": 512, "roberta-large-mnli": 512, "distilroberta-base": 512, "roberta-base-openai-detector": 512, "roberta-large-openai-detector": 512, } class RobertaTokenizer(GPT2Tokenizer): """ Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, merges_file, errors="replace", bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", **kwargs ): super().__init__( vocab_file=vocab_file, merges_file=merges_file, errors=errors, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs, ) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A RoBERTa sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] def prepare_for_tokenization(self, text, add_special_tokens=False, **kwargs): if "add_prefix_space" in kwargs: add_prefix_space = kwargs["add_prefix_space"] else: add_prefix_space = add_special_tokens if add_prefix_space and not text[0].isspace(): text = " " + text return text class RobertaTokenizerFast(GPT2TokenizerFast): """ Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library). Peculiarities: - Byte-level Byte-Pair-Encoding - Requires a space to start the input string => the encoding methods should be called with the ``add_prefix_space`` flag set to ``True``. Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: :: tokenizer.decode(tokenizer.encode("Hello")) = " Hello" This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. merges_file (:obj:`str`): Path to the merges file. errors (:obj:`str`, `optional`, defaults to "replace"): Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode `__ for more information. unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The beginning of sequence token. eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`): The end of sequence token. add_prefix_space (:obj:`bool`, `optional`, defaults to `False`): Whether to add a leading space to the first word. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceeding space) trim_offsets (:obj:`bool`, `optional`, defaults to `True`): Whether the post processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, merges_file, errors="replace", bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", add_prefix_space=True, trim_offsets=True, **kwargs ): kwargs.setdefault("pad_token", pad_token) kwargs.setdefault("sep_token", sep_token) kwargs.setdefault("cls_token", cls_token) kwargs.setdefault("mask_token", mask_token) super().__init__( vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, **kwargs, ) self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing( sep=(sep_token, self.sep_token_id), cls=(cls_token, self.cls_token_id), add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, ) self.backend_tokenizer.add_special_tokens([kwargs["mask_token"]]) @PreTrainedTokenizer.mask_token.setter def mask_token(self, value): if not isinstance(value, AddedToken): value = AddedToken(value, lstrip=True) self._mask_token = str(value) self._maybe_update_backend([value]) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] if token_ids_1 is None: return output return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_t5.py ================================================ # coding=utf-8 # Copyright 2018 T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization class for model T5.""" import logging import os import re from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } #################################################### # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, "t5-base": 512, "t5-large": 512, "t5-3b": 512, "t5-11b": 512, } class T5Tokenizer(PreTrainedTokenizer): """ Constructs an XLNet tokenizer. Based on `SentencePiece `__ . This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a `.spm` extension) that contains the vocabulary necessary to instantiate a tokenizer. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`): Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are accessible as "" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed from the end of the vocabulary up to beginnning ("" is the last token in the vocabulary like in T5 preprocessing see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`): Additional special tokens used by the tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, eos_token="", unk_token="", pad_token="", extra_ids=100, additional_special_tokens=None, **kwargs ): # Add extra_ids to the special token list if extra_ids > 0: if additional_special_tokens is None: additional_special_tokens = [] additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, additional_special_tokens=additional_special_tokens, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use T5Tokenizer:" "https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.vocab_file = vocab_file self._extra_ids = extra_ids self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return self.sp_model.get_piece_size() + self._extra_ids def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use T5Tokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def _tokenize(self, text, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) return pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: token = "".format(self.vocab_size - 1 - index) return token def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = self.sp_model.decode_pieces(tokens) return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_transfo_xl.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. """ import glob import logging import os import pickle import re from collections import Counter, OrderedDict from typing import Optional import numpy as np from tokenizers import Tokenizer from tokenizers.implementations import BaseTokenizer from tokenizers.models import WordLevel from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit from tokenizers.processors import BertProcessing from .file_utils import cached_path, is_torch_available from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast if is_torch_available(): import torch logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"} VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"} PRETRAINED_VOCAB_FILES_MAP = { "pretrained_vocab_file": { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", } } PRETRAINED_VOCAB_FILES_MAP_FAST = { "pretrained_vocab_file": { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "transfo-xl-wt103": None, } PRETRAINED_CORPUS_ARCHIVE_MAP = { "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", } CORPUS_NAME = "corpus.bin" class TransfoXLTokenizer(PreTrainedTokenizer): """ Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = [] def __init__( self, special=None, min_freq=0, max_size=None, lower_case=False, delimiter=None, vocab_file=None, pretrained_vocab_file=None, never_split=None, unk_token="", eos_token="", additional_special_tokens=[""], **kwargs ): super().__init__( unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs ) if never_split is None: never_split = self.all_special_tokens if special is None: special = [] self.counter = Counter() self.special = special self.min_freq = min_freq self.max_size = max_size self.lower_case = lower_case self.delimiter = delimiter self.vocab_file = vocab_file self.never_split = never_split self.punctuation_symbols = '!"#$%&()*+,-./\:;<=>?@[\\]^_`{|}~' # noqa: W605 self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols)) self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern() try: if pretrained_vocab_file is not None: # Hack because, honestly this tokenizer was not made to be used # in a library like ours, at all. vocab_dict = torch.load(pretrained_vocab_file) for key, value in vocab_dict.items(): if key not in self.__dict__: self.__dict__[key] = value if vocab_file is not None: self.build_vocab() except Exception: raise ValueError( "Unable to parse file {}. Unknown format. " "If you tried to load a model saved through TransfoXLTokenizerFast," "please note they are not compatible.".format(pretrained_vocab_file) ) if vocab_file is not None: self.build_vocab() def _compile_space_around_punctuation_pattern(self): look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols) look_ahead_to_match_all_except_space = "(?=[^\s])" # noqa: W605 return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space) def count_file(self, path, verbose=False, add_eos=False): if verbose: logger.info("counting file {} ...".format(path)) assert os.path.exists(path) sents = [] with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos) self.counter.update(symbols) sents.append(symbols) return sents def count_sents(self, sents, verbose=False): """ sents : a list of sentences, each a list of tokenized symbols """ if verbose: logger.info("counting {} sents ...".format(len(sents))) for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) self.counter.update(symbols) def _build_from_file(self, vocab_file): self.idx2sym = [] self.sym2idx = OrderedDict() with open(vocab_file, "r", encoding="utf-8") as f: for line in f: symb = line.strip().split()[0] self.add_symbol(symb) if "" in self.sym2idx: self.unk_idx = self.sym2idx[""] elif "" in self.sym2idx: self.unk_idx = self.sym2idx[""] else: raise ValueError("No token in vocabulary") def save_vocabulary(self, vocab_path): """ Save the vocabulary and special tokens file to a directory. Args: vocab_path (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ logger.warning( "Please note you will not be able to load the save vocabulary in" " Rust-based TransfoXLTokenizerFast as they don't share the same structure." ) if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) else: vocab_file = vocab_path torch.save(self.__dict__, vocab_file) return (vocab_file,) def build_vocab(self): if self.vocab_file: logger.info("building vocab from {}".format(self.vocab_file)) self._build_from_file(self.vocab_file) logger.info("final vocab size {}".format(len(self))) else: logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size)) self.idx2sym = [] self.sym2idx = OrderedDict() for sym in self.special: self.add_special(sym) for sym, cnt in self.counter.most_common(self.max_size): if cnt < self.min_freq: break self.add_symbol(sym) logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter))) def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): if verbose: logger.info("encoding file {} ...".format(path)) assert os.path.exists(path) encoded = [] with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos) encoded.append(self.convert_to_tensor(symbols)) if ordered: encoded = torch.cat(encoded) return encoded def encode_sents(self, sents, ordered=False, verbose=False): if verbose: logger.info("encoding {} sents ...".format(len(sents))) encoded = [] for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: logger.info(" line {}".format(idx)) encoded.append(self.convert_to_tensor(symbols)) if ordered: encoded = torch.cat(encoded) return encoded def add_special(self, sym): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym]) def add_symbol(self, sym): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 def _convert_id_to_token(self, idx): """Converts an id in a token (BPE) using the vocab.""" assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx) return self.idx2sym[idx] def _convert_token_to_id(self, sym): """ Converts a token (str) in an id using the vocab. """ if sym in self.sym2idx: return self.sym2idx[sym] else: # logger.info('encounter unk {}'.format(sym)) # assert '' not in sym if hasattr(self, "unk_idx"): return self.sym2idx.get(sym, self.unk_idx) # Backward compatibility with pre-trained models elif "" in self.sym2idx: return self.sym2idx[""] elif "" in self.sym2idx: return self.sym2idx[""] else: raise ValueError("Token not in vocabulary and no token in vocabulary for replacement") def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = " ".join(tokens).strip() return out_string def convert_to_tensor(self, symbols): return torch.LongTensor(self.convert_tokens_to_ids(symbols)) @property def vocab_size(self): return len(self.idx2sym) def get_vocab(self): return dict(self.sym2idx, **self.added_tokens_encoder) def _tokenize(self, line, add_eos=False, add_double_eos=False): line = line.strip() # convert to lower case if self.lower_case: line = line.lower() # empty delimiter '' will evaluate False if self.delimiter == "": symbols = line else: symbols = line.split(self.delimiter) if add_double_eos: # lm1b return [""] + symbols + [""] elif add_eos: return symbols + [""] else: return symbols def prepare_for_tokenization(self, text, **kwargs): # add spaces before punctuation symbols as should be done in transfo-xl if "add_space_before_punct_symbol" in kwargs and kwargs["add_space_before_punct_symbol"]: text = self.punctuation_with_space_around_pattern.sub(r" ", text) elif self.punction_without_space_before_pattern.search(text): # searches until the first occurence of a punctuation symbol without surrounding spaces logger.warning( "You might want to consider setting `add_space_before_punct_symbol=True` as an argument to the `tokenizer.encode()` to avoid tokenizing words with punctuation symbols to the `` token" ) return text class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer): def __init__( self, vocab_file, delimiter, lowercase, unk_token, eos_token, add_eos=False, add_double_eos=False, normalization: Optional[str] = None, ): try: tokenizer = WordLevel(vocab_file, unk_token=unk_token) tokenizer = Tokenizer(tokenizer) except Exception: raise ValueError( "Unable to parse file {}. Unknown format. " "If you tried to load a model saved through TransfoXLTokenizer," "please note they are not compatible.".format(vocab_file) ) # Create the correct normalization path normalizer = [] # Include unicode normalization if normalization: normalizer += [unicode_normalizer_from_str(normalization)] # Include case normalization if lowercase: normalizer += [Lowercase()] # Strip normalizer at the end normalizer += [Strip(left=True, right=True)] if len(normalizer) > 0: tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0] # Setup the splitter tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit() if add_double_eos: tokenizer.post_processor = BertProcessing( (eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token)) ) parameters = { "model": "TransfoXLModel", "add_eos": add_eos, "add_double_eos": add_double_eos, "unk_token": unk_token, "eos_token": eos_token, "delimiter": delimiter, "lowercase": lowercase, } super().__init__(tokenizer, parameters) class TransfoXLTokenizerFast(PreTrainedTokenizerFast): """ Construct a "Fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library). The Transformer-XL tokenizer is a word-level tokenizer (no sub-word tokenization). Adapted from Vocab class in https://github.com/kimiyoung/transformer-xl This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizerFast` which contains most of the methods. Users should refer to the superclass for more information regarding methods. """ vocab_files_names = VOCAB_FILES_NAMES_FAST pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = [] def __init__( self, special=None, min_freq=0, max_size=None, lower_case=False, delimiter=None, vocab_file=None, pretrained_vocab_file=None, never_split=None, unk_token="", eos_token="", additional_special_tokens=[""], add_eos=False, add_double_eos=False, normalization=None, **kwargs ): super().__init__( _TransfoXLDelimiterLookupTokenizer( vocab_file=vocab_file or pretrained_vocab_file, delimiter=delimiter, lowercase=lower_case, unk_token=unk_token, eos_token=eos_token, add_eos=add_eos, add_double_eos=add_double_eos, normalization=normalization, ), unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs, ) def save_pretrained(self, save_directory): logger.warning( "Please note you will not be able to load the vocabulary in" " Python-based TransfoXLTokenizer as they don't share the same structure." ) return super().save_pretrained(save_directory) class LMOrderedIterator(object): def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): """ data -- LongTensor -- the LongTensor is strictly ordered """ self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device # Work out how cleanly we can divide the dataset into bsz parts. self.n_step = data.size(0) // bsz # Trim off any extra elements that wouldn't cleanly fit (remainders). data = data.narrow(0, 0, self.n_step * bsz) # Evenly divide the data across the bsz batches. self.data = data.view(bsz, -1).t().contiguous().to(device) # Number of mini-batches self.n_batch = (self.n_step + self.bptt - 1) // self.bptt def get_batch(self, i, bptt=None): if bptt is None: bptt = self.bptt seq_len = min(bptt, self.data.size(0) - 1 - i) end_idx = i + seq_len beg_idx = max(0, i - self.ext_len) data = self.data[beg_idx:end_idx] target = self.data[i + 1 : i + 1 + seq_len] data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) return data_out, target_out, seq_len def get_fixlen_iter(self, start=0): for i in range(start, self.data.size(0) - 1, self.bptt): yield self.get_batch(i) def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): max_len = self.bptt + max_deviation * std i = start while True: bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) data, target, seq_len = self.get_batch(i, bptt) i += seq_len yield data, target, seq_len if i >= self.data.size(0) - 2: break def __iter__(self): return self.get_fixlen_iter() class LMShuffledIterator(object): def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): """ data -- list[LongTensor] -- there is no order among the LongTensors """ self.data = data self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device self.shuffle = shuffle def get_sent_stream(self): # index iterator epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data))) # sentence iterator for idx in epoch_indices: yield self.data[idx] def stream_iterator(self, sent_stream): # streams for each data in the batch streams = [None] * self.bsz data = torch.LongTensor(self.bptt, self.bsz) target = torch.LongTensor(self.bptt, self.bsz) n_retain = 0 while True: # data : [n_retain+bptt x bsz] # target : [bptt x bsz] data[n_retain:].fill_(-1) target.fill_(-1) valid_batch = True for i in range(self.bsz): n_filled = 0 try: while n_filled < self.bptt: if streams[i] is None or len(streams[i]) <= 1: streams[i] = next(sent_stream) # number of new tokens to fill in n_new = min(len(streams[i]) - 1, self.bptt - n_filled) # first n_retain tokens are retained from last batch data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new] target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1] streams[i] = streams[i][n_new:] n_filled += n_new except StopIteration: valid_batch = False break if not valid_batch: return data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) yield data_out, target_out, self.bptt n_retain = min(data.size(0), self.ext_len) if n_retain > 0: data[:n_retain] = data[-n_retain:] data.resize_(n_retain + self.bptt, data.size(1)) def __iter__(self): # sent_stream is an iterator sent_stream = self.get_sent_stream() for batch in self.stream_iterator(sent_stream): yield batch class LMMultiFileIterator(LMShuffledIterator): def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): self.paths = paths self.vocab = vocab self.bsz = bsz self.bptt = bptt self.ext_len = ext_len if ext_len is not None else 0 self.device = device self.shuffle = shuffle def get_sent_stream(self, path): sents = self.vocab.encode_file(path, add_double_eos=True) if self.shuffle: np.random.shuffle(sents) sent_stream = iter(sents) return sent_stream def __iter__(self): if self.shuffle: np.random.shuffle(self.paths) for path in self.paths: # sent_stream is an iterator sent_stream = self.get_sent_stream(path) for batch in self.stream_iterator(sent_stream): yield batch class TransfoXLCorpus(object): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ Instantiate a pre-processed corpus. """ vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] else: corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) # redirect to the cache, if necessary try: resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) except EnvironmentError: logger.error( "Corpus '{}' was not found in corpus list ({}). " "We assumed '{}' was a path or url but couldn't find files {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, corpus_file, ) ) return None if resolved_corpus_file == corpus_file: logger.info("loading corpus file {}".format(corpus_file)) else: logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file)) # Instantiate tokenizer. corpus = cls(*inputs, **kwargs) corpus_dict = torch.load(resolved_corpus_file) for key, value in corpus_dict.items(): corpus.__dict__[key] = value corpus.vocab = vocab if corpus.train is not None: corpus.train = torch.tensor(corpus.train, dtype=torch.long) if corpus.valid is not None: corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) if corpus.test is not None: corpus.test = torch.tensor(corpus.test, dtype=torch.long) return corpus def __init__(self, *args, **kwargs): self.vocab = TransfoXLTokenizer(*args, **kwargs) self.dataset = None self.train = None self.valid = None self.test = None def build_corpus(self, path, dataset): self.dataset = dataset if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: self.vocab.count_file(os.path.join(path, "train.txt")) self.vocab.count_file(os.path.join(path, "valid.txt")) self.vocab.count_file(os.path.join(path, "test.txt")) elif self.dataset == "wt103": self.vocab.count_file(os.path.join(path, "train.txt")) elif self.dataset == "lm1b": train_path_pattern = os.path.join( path, "1-billion-word-language-modeling-benchmark-r13output", "training-monolingual.tokenized.shuffled", "news.en-*", ) train_paths = glob.glob(train_path_pattern) # the vocab will load from file when build_vocab() is called self.vocab.build_vocab() if self.dataset in ["ptb", "wt2", "wt103"]: self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True) self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True) elif self.dataset in ["enwik8", "text8"]: self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False) self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False) elif self.dataset == "lm1b": self.train = train_paths self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True) self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True) def get_iterator(self, split, *args, **kwargs): if split == "train": if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(self.train, *args, **kwargs) elif self.dataset == "lm1b": kwargs["shuffle"] = True data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) elif split in ["valid", "test"]: data = self.valid if split == "valid" else self.test if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(data, *args, **kwargs) elif self.dataset == "lm1b": data_iter = LMShuffledIterator(data, *args, **kwargs) return data_iter def get_lm_corpus(datadir, dataset): fn = os.path.join(datadir, "cache.pt") fn_pickle = os.path.join(datadir, "cache.pkl") if os.path.exists(fn): logger.info("Loading cached dataset...") corpus = torch.load(fn_pickle) elif os.path.exists(fn): logger.info("Loading cached dataset from pickle...") with open(fn, "rb") as fp: corpus = pickle.load(fp) else: logger.info("Producing dataset {}...".format(dataset)) kwargs = {} if dataset in ["wt103", "wt2"]: kwargs["special"] = [""] kwargs["lower_case"] = False elif dataset == "ptb": kwargs["special"] = [""] kwargs["lower_case"] = True elif dataset == "lm1b": kwargs["special"] = [] kwargs["lower_case"] = False kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") elif dataset in ["enwik8", "text8"]: pass corpus = TransfoXLCorpus(datadir, dataset, **kwargs) torch.save(corpus, fn) return corpus ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_utils.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for python and fast tokenizers. Fast tokenizers are provided by HuggingFace's tokenizers library.""" import copy import functools import itertools import json import logging import operator import os import re import warnings from collections import UserDict, defaultdict from contextlib import contextmanager from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from tokenizers import AddedToken as AddedTokenFast from tokenizers import Encoding as EncodingFast from tokenizers.decoders import Decoder as DecoderFast from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available, torch_required if is_tf_available(): import tensorflow as tf if is_torch_available(): import torch logger = logging.getLogger(__name__) SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER # Define type aliases and NamedTuples TextInput = str PreTokenizedInput = List[str] EncodedInput = List[int] TextInputPair = Tuple[str, str] PreTokenizedInputPair = Tuple[List[str], List[str]] EncodedInputPair = Tuple[List[int], List[int]] class CharSpan(NamedTuple): """ Character span in the original string Args: start: index of the first character in the original string end: index of the character following the last character in the original string """ start: int end: int class TokenSpan(NamedTuple): """ Token span in an encoded string (list of tokens) Args: start: index of the first token in the span end: index of the token following the last token in the span """ start: int end: int def flatten(x: Sequence): """ Flatten the provided (potentially nested) sequence Args: x (Sequence): Potentially nested sequence to flatten Returns: list: Flattened sequence """ return functools.reduce(operator.iconcat, x, []) @contextmanager def truncate_and_pad( tokenizer: BaseTokenizerFast, max_length: int, stride: int, strategy: str, pad_to_max_length: bool, padding_side: str, pad_token_id: int, pad_token_type_id: int, pad_token: str, ): """ This contextmanager is in charge of defining the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers library) and restore the tokenizer settings afterwards. This contextmanager assumes the provider tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a padding / truncation strategy before, then it will be reset to no padding/truncation when exiting the managed section. Args: tokenizer (BaseTokenizerFast): The tokenizer which will be used max_length (int): The maximum size of the sequence stride (int): The stride to use when handling overflow strategy (str): Overflowing logic to use pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length padding_side (str): "left" or "right" indicating the direction the output sequence will be padded pad_token_id (int): The integer representation of the padding token to use pad_token_type_id (int): The integer representation of the padding token type to use pad_token (str): The string representation of the padding token to use """ # Handle all the truncation and padding stuff if max_length is not None: tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) if pad_to_max_length and (pad_token and pad_token_id >= 0): tokenizer.enable_padding( max_length=max_length, direction=padding_side, pad_id=pad_token_id, pad_type_id=pad_token_type_id, pad_token=pad_token, ) elif pad_to_max_length: logger.warning( "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n" "To remove this error, you can add a new pad token and then resize model embedding:\n" "\ttokenizer.pad_token = ''\n\tmodel.resize_token_embeddings(len(tokenizer))".format( pad_token, pad_token_id ) ) yield # TODO(morgan, anthony): once we have a simple way to serialize tokenizers maybe store and restore the state afterward # to avoid destructing the padding / truncation strategy as we do now. if max_length is not None: tokenizer.no_truncation() if pad_to_max_length and (pad_token and pad_token_id >= 0): tokenizer.no_padding() class BatchEncoding(UserDict): """ BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc). This class is derived from a python Dictionary and can be used as a dictionnary. In addition, this class expose utility methods to map from word/char space to token space. Args: data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask'...) encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`): If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/char space to token space the `EncodingFast` instance or list of instance (for batches) hold these informations. """ def __init__( self, data: Optional[Dict[str, Any]] = None, encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, ): super().__init__(data) if isinstance(encoding, EncodingFast): encoding = [encoding] self._encodings = encoding def __getitem__(self, item: Union[int, str]) -> EncodingFast: """ If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...) If the key is an integer, get the EncodingFast for batch item with index `key` """ if isinstance(item, str): return self.data[item] elif self._encodings is not None: return self._encodings[item] else: raise KeyError( "Indexing with integers (to access backend Encoding for a given batch index) " "is not available when using Python based tokenizers" ) def __getattr__(self, item: str): return self.data[item] def keys(self): return self.data.keys() def values(self): return self.data.values() def items(self): return self.data.items() # After this point: # Extended properties and methods only available for fast (Rust-based) tokenizers # provided by HuggingFace tokenizers library. @property def encodings(self) -> Optional[List[EncodingFast]]: """ Return the list all encoding from the tokenization process Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer """ return self._encodings def tokens(self, batch_index: int = 0) -> List[int]: if not self._encodings: raise ValueError("tokens() is not available when using Python based tokenizers") return self._encodings[batch_index].tokens def words(self, batch_index: int = 0) -> List[Optional[int]]: if not self._encodings: raise ValueError("words() is not available when using Python based tokenizers") return self._encodings[batch_index].words def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. Can be called as: - self.token_to_word(token_index) if batch size is 1 - self.token_to_word(batch_index, token_index) if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the sequence. Returns: word_index (:obj:`int`): index of the word in the input sequence. """ if not self._encodings: raise ValueError("token_to_word() is not available when using Python based tokenizers") if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan: """ Get the encoded token span corresponding to a word in the sequence of the batch. Token spans are returned as a TokenSpan NamedTuple with: start: index of the first token end: index of the token following the last token Can be called as: - self.word_to_tokens(word_index) if batch size is 1 - self.word_to_tokens(batch_index, word_index) if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_word_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the word in the sequence word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: token_span (:obj:`TokenSpan`): Span of tokens in the encoded sequence. TokenSpan are NamedTuple with: start: index of the first token end: index of the token following the last token """ if not self._encodings: raise ValueError("word_to_tokens() is not available when using Python based tokenizers") if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index if batch_index < 0: batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index))) def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: """ Get the character span corresponding to an encoded token in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: start: index of the first character in the original string associated to the token end: index of the character following the last character in the original string associated to the token Can be called as: - self.token_to_chars(token_index) if batch size is 1 - self.token_to_chars(batch_index, token_index) if batch size is greater or equal to 1 Args: batch_or_token_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in the sequence. Returns: char_span (:obj:`CharSpan`): Span of characters in the original string. CharSpan are NamedTuple with: start: index of the first character in the original string end: index of the character following the last character in the original string """ if not self._encodings: raise ValueError("token_to_chars() is not available when using Python based tokenizers") if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. Can be called as: - self.char_to_token(char_index) if batch size is 1 - self.char_to_token(batch_index, char_index) if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: token_index (:obj:`int`): Index of the token. """ if not self._encodings: raise ValueError("char_to_token() is not available when using Python based tokenizers") if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_token(char_index) def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: """ Get the character span in the original string corresponding to given word in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: start: index of the first character in the original string end: index of the character following the last character in the original string Can be called as: - self.word_to_chars(word_index) if batch size is 1 - self.word_to_chars(batch_index, word_index) if batch size is greater or equal to 1 Args: batch_or_word_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. Returns: char_span (:obj:`CharSpan` or :obj:`List[CharSpan]`): Span(s) of the associated character or characters in the string. CharSpan are NamedTuple with: start: index of the first character associated to the token in the original string end: index of the character following the last character associated to the token in the original string """ if not self._encodings: raise ValueError("word_to_chars() is not available when using Python based tokenizers") if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. Can be called as: - self.char_to_word(char_index) if batch size is 1 - self.char_to_word(batch_index, char_index) if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (:obj:`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the character in the orginal string. char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the orginal string. Returns: token_index (:obj:`int` or :obj:`List[int]`): Index or indices of the associated encoded token(s). """ if not self._encodings: raise ValueError("char_to_word() is not available when using Python based tokenizers") if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_word(char_index) @torch_required def to(self, device: str): """Send all values to device by calling v.to(device)""" self.data = {k: v.to(device) for k, v in self.data.items()} return self class SpecialTokensMixin: """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and handles specific behaviors related to special tokens. In particular, this class hold the attributes which can be used to directly access to these special tokens in a model-independant manner and allow to set and update the special tokens. """ SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", "additional_special_tokens", ] def __init__(self, **kwargs): self._bos_token = None self._eos_token = None self._unk_token = None self._sep_token = None self._pad_token = None self._cls_token = None self._mask_token = None self._pad_token_type_id = 0 self._additional_special_tokens = [] for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) setattr(self, key, value) elif isinstance(value, AddedTokenFast): setattr(self, key, str(value)) elif isinstance(value, str): setattr(self, key, value) else: raise TypeError( "special token {} has to be either str or AddedTokenFast but got: {}".format(key, type(value)) ) @property def bos_token(self): """ Beginning of sentence token (string). Log an error if used while not having been set. """ if self._bos_token is None: logger.error("Using bos_token, but it is not set yet.") return self._bos_token @property def eos_token(self): """ End of sentence token (string). Log an error if used while not having been set. """ if self._eos_token is None: logger.error("Using eos_token, but it is not set yet.") return self._eos_token @property def unk_token(self): """ Unknown token (string). Log an error if used while not having been set. """ if self._unk_token is None: logger.error("Using unk_token, but it is not set yet.") return self._unk_token @property def sep_token(self): """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ if self._sep_token is None: logger.error("Using sep_token, but it is not set yet.") return self._sep_token @property def pad_token(self): """ Padding token (string). Log an error if used while not having been set. """ if self._pad_token is None: logger.error("Using pad_token, but it is not set yet.") return self._pad_token @property def cls_token(self): """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ if self._cls_token is None: logger.error("Using cls_token, but it is not set yet.") return self._cls_token @property def mask_token(self): """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ if self._mask_token is None: logger.error("Using mask_token, but it is not set yet.") return self._mask_token @property def additional_special_tokens(self): """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ if self._additional_special_tokens is None: logger.error("Using additional_special_tokens, but it is not set yet.") return self._additional_special_tokens def _maybe_update_backend(self, value): """ To be overriden by derived class if a backend tokenizer has to be updated. """ pass @bos_token.setter def bos_token(self, value): self._bos_token = value self._maybe_update_backend([value]) @eos_token.setter def eos_token(self, value): self._eos_token = value self._maybe_update_backend([value]) @unk_token.setter def unk_token(self, value): self._unk_token = value self._maybe_update_backend([value]) @sep_token.setter def sep_token(self, value): self._sep_token = value self._maybe_update_backend([value]) @pad_token.setter def pad_token(self, value): self._pad_token = value self._maybe_update_backend([value]) @cls_token.setter def cls_token(self, value): self._cls_token = value self._maybe_update_backend([value]) @mask_token.setter def mask_token(self, value): self._mask_token = value self._maybe_update_backend([value]) @additional_special_tokens.setter def additional_special_tokens(self, value): self._additional_special_tokens = value self._maybe_update_backend(value) @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self): """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.eos_token) @property def unk_token_id(self): """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self): """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.pad_token) @property def pad_token_type_id(self): """ Id of the padding token type in the vocabulary.""" return self._pad_token_type_id @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self): """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self): """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.additional_special_tokens) @property def special_tokens_map(self): """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their values ('', ''...) """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = attr_value return set_attr @property def all_special_tokens(self): """ List all the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = [] set_attr = self.special_tokens_map for attr_value in set_attr.values(): all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) all_toks = list(set(all_toks)) return all_toks @property def all_special_ids(self): """ List the vocabulary indices of the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = self.all_special_tokens all_ids = self.convert_tokens_to_ids(all_toks) return all_ids class PreTrainedTokenizer(SpecialTokensMixin): """ Base class for all tokenizers. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). Class attributes (overridden by derived classes): - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. Args: - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model. When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`). no associated max_length can be found in ``max_model_input_sizes``. - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. Should be selected between ['right', 'left'] - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the model ("token_type_ids", "attention_mask"...). - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ vocab_files_names: Dict[str, str] = {} pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} max_model_input_sizes: Dict[str, int] = {} model_input_names: List[str] = ["token_type_ids", "attention_mask"] padding_side: str = "right" NO_PAD_TOKEN_FOR_BATCH_MSG = ( "No padding token is set for this model, therefore no batch can be made with uneven " "sequences. Set a padding token or adjust the lengths of the sequences building the " "batch so that every sequence is of the same length." ) UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( "The sequences building the batch are not of the same size, no tensor " "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" "up to the larger sequence's length." ) @property def vocab_size(self) -> int: """ Size of the base vocabulary (without the added tokens) """ raise NotImplementedError @property def is_fast(self) -> bool: return False @property def max_len(self) -> int: """ Kept here for backward compatibility. Now renamed to `model_max_length` to avoid ambiguity. """ return self.model_max_length @property def max_len_single_sentence(self) -> int: return self.model_max_length - self.num_special_tokens_to_add(pair=False) @property def max_len_sentences_pair(self) -> int: return self.model_max_length - self.num_special_tokens_to_add(pair=True) @max_len_single_sentence.setter def max_len_single_sentence(self, value) -> int: """ For backward compatibility, allow to try to setup 'max_len_single_sentence' """ if value == self.model_max_length - self.num_special_tokens_to_add(pair=False): logger.warning( "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." ) else: raise ValueError( "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." ) @max_len_sentences_pair.setter def max_len_sentences_pair(self, value) -> int: """ For backward compatibility, allow to try to setup 'max_len_sentences_pair' """ if value == self.model_max_length - self.num_special_tokens_to_add(pair=True): logger.warning( "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." ) else: raise ValueError( "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." ) def get_vocab(self): """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """ raise NotImplementedError() def __init__(self, model_max_length=None, **kwargs): super().__init__(**kwargs) # For backward compatibility we fallback to set model_max_length from max_len if provided if "max_len" in kwargs: warnings.warn( "Parameter max_len is deprecated and will be removed in a future release. " "Use model_max_length instead.", category=FutureWarning, ) model_max_length = kwargs.pop("max_len") self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. self.padding_side = kwargs.pop("padding_side", self.padding_side) assert self.padding_side in [ "right", "left", ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) # Added tokens self.added_tokens_encoder = {} self.unique_added_tokens_encoder = set() self.added_tokens_decoder = {} # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) self.init_inputs = () self.init_kwargs = {} def __len__(self): """ Size of the full vocabulary with the added tokens """ return self.vocab_size + len(self.added_tokens_encoder) @classmethod def from_pretrained(cls, *inputs, **kwargs): r""" Instantiate a :class:`~transformers1.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers1.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the vocabulary files and override the cached versions if they exists. resume_download: (`optional`) boolean, default False: Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers1.PreTrainedTokenizer` for details. Examples:: # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer # Download vocabulary from S3 and cache. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 (user-uploaded) and cache. tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') # If the tokenizer uses a single vocabulary file, you can point directly to this file tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') # You can link tokens to special vocabulary when instantiating tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') # You should be sure '' is in the vocabulary when doing that. # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) assert tokenizer.unk_token == '' """ return cls._from_pretrained(*inputs, **kwargs) @classmethod def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} init_configuration = {} if pretrained_model_name_or_path in s3_models: # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] if ( cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration ): init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() else: # Get the vocabulary from local files logger.info( "Model name '{}' not found in model shortcut name list ({}). " "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path ) ) if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not supported." "Use a model identifier or the path to a directory instead." ) logger.warning( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated" ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } # Look for the tokenizer main vocabulary files + the additional tokens files for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): if os.path.isdir(pretrained_model_name_or_path): full_file_name = os.path.join(pretrained_model_name_or_path, file_name) if not os.path.exists(full_file_name): logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) full_file_name = None else: full_file_name = hf_bucket_url( pretrained_model_name_or_path, filename=file_name, use_cdn=False ) vocab_files[file_id] = full_file_name # Get files from url, cache, or disk depending on the case try: resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None else: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, ) except EnvironmentError: if pretrained_model_name_or_path in s3_models: msg = "Couldn't reach server at '{}' to download vocabulary files." else: msg = ( "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {}, but couldn't find such vocabulary files at this path or url.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path, list(cls.vocab_files_names.values()), ) ) raise EnvironmentError(msg) if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " "named {} but couldn't find such vocabulary files at this path or url.".format( pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path, list(cls.vocab_files_names.values()), ) ) for file_id, file_path in vocab_files.items(): if file_path == resolved_vocab_files[file_id]: logger.info("loading file {}".format(file_path)) else: logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) if tokenizer_config_file is not None: with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: init_kwargs = init_configuration # Update with newly provided kwargs init_kwargs.update(kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if special_tokens_map_file is not None: with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: special_tokens_map = json.load(special_tokens_map_handle) for key, value in special_tokens_map.items(): if key not in init_kwargs: init_kwargs[key] = value # Instantiate tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` tokenizer.init_inputs = init_inputs tokenizer.init_kwargs = init_kwargs # update unique_added_tokens_encoder with special tokens for correct tokenization tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) # Add supplementary tokens. if added_tokens_file is not None: with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} tokenizer.added_tokens_encoder.update(added_tok_encoder) tokenizer.added_tokens_decoder.update(added_tok_decoder) tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys())) return tokenizer def save_pretrained(self, save_directory): """ Save the tokenizer vocabulary files together with: - added tokens, - special-tokens-to-class-attributes-mapping, - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). Warning: This won't save modifications you may have applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers1.PreTrainedTokenizer.from_pretrained` class method. """ if not os.path.isdir(save_directory): logger.error("Saving directory ({}) should be a directory".format(save_directory)) return special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) tokenizer_config = copy.deepcopy(self.init_kwargs) if len(self.init_inputs) > 0: tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) for file_id in self.vocab_files_names.keys(): tokenizer_config.pop(file_id, None) with open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) with open(special_tokens_map_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) if len(self.added_tokens_encoder) > 0: with open(added_tokens_file, "w", encoding="utf-8") as f: out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) f.write(out_str) vocab_files = self.save_vocabulary(save_directory) return vocab_files + (special_tokens_map_file, added_tokens_file) def save_vocabulary(self, save_directory) -> Tuple[str]: """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens and special token mappings. Please use :func:`~transformers1.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers1.PreTrainedTokenizer.from_pretrained` class method. """ raise NotImplementedError def add_tokens(self, new_tokens: Union[str, List[str]]) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. """ if not new_tokens: return 0 if not isinstance(new_tokens, list): new_tokens = [new_tokens] tokens_to_add = [] for token in new_tokens: assert isinstance(token, str) if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() if ( token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add ): tokens_to_add.append(token) logger.info("Adding %s to the vocabulary", token) added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) self.added_tokens_decoder.update(added_tok_decoder) return len(tokens_to_add) def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. Note: This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the number of added tokens in the case of a single sequence if set to False. Returns: Number of tokens added to sequences """ token_ids_0 = [] token_ids_1 = [] return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def add_special_tokens(self, special_tokens_dict): """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). Using `add_special_tokens` will ensure your special tokens can be used in several ways: - special tokens are carefully handled by the tokenizer (they are never split) - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') Args: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``]. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to add a new classification token to GPT-2 tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2') special_tokens_dict = {'cls_token': ''} num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. assert tokenizer.cls_token == '' """ if not special_tokens_dict: return 0 added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) added_tokens += self.add_tokens(value) else: assert isinstance(value, str) added_tokens += self.add_tokens([value]) logger.info("Assigning %s to the %s key of the tokenizer", value, key) setattr(self, key, value) return added_tokens def tokenize(self, text: TextInput, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Take care of added tokens. Args: text (:obj:`string`): The sequence to be encoded. **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method. """ all_special_tokens = self.all_special_tokens text = self.prepare_for_tokenization(text, **kwargs) # TODO: should this be in the base class? def lowercase_text(t): # convert non-special tokens to lowercase escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) if self.init_kwargs.get("do_lower_case", False): text = lowercase_text(text) def split_on_token(tok, text): result = [] split_text = text.split(tok) for i, sub_text in enumerate(split_text): sub_text = sub_text.rstrip() if i == 0 and not sub_text: result += [tok] elif i == len(split_text) - 1: if sub_text: result += [sub_text] else: pass else: if sub_text: result += [sub_text] result += [tok] return result def split_on_tokens(tok_list, text): if not text.strip(): return [] if not tok_list: return self._tokenize(text) tokenized_text = [] text_list = [text] for tok in tok_list: tokenized_text = [] for sub_text in text_list: if sub_text not in self.unique_added_tokens_encoder: tokenized_text += split_on_token(tok, sub_text) else: tokenized_text += [sub_text] text_list = tokenized_text return list( itertools.chain.from_iterable( ( self._tokenize(token) if token not in self.unique_added_tokens_encoder else [token] for token in tokenized_text ) ) ) added_tokens = self.unique_added_tokens_encoder tokenized_text = split_on_tokens(added_tokens, text) return tokenized_text def _tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Do NOT take care of added tokens. """ raise NotImplementedError def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. """ if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): raise NotImplementedError def encode( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, return_tensors: Optional[str] = None, **kwargs ): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary. You can set it to the maximal input size of the model with `max_length = tokenizer.model_max_length`. stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method """ encoded_inputs = self.encode_plus( text, text_pair=text_pair, max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs, ) return encoded_inputs["input_ids"] def encode_plus( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the later only for not-fast tokenizers)): The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` method) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary You can set it to the maximal input size of the model with `max_length = tokenizer.model_max_length`. stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. is_pretokenized (:obj:`bool`, defaults to :obj:`False`): Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are token type IDs? <../glossary.html#token-type-ids>`_ return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return overflowing token information (default False). return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return special tokens mask information (default False). return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return (char_start, char_end) for each token (default False). If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast. **kwargs: passed to the `self.tokenize()` method Return: A Dictionary of shape:: { input_ids: list[int], token_type_ids: list[int] if return_token_type_ids is True (default) attention_mask: list[int] if return_attention_mask is True (default) overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. """ def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers." "To use this feature, change your tokenizer to one deriving from " "transformers1.PreTrainedTokenizerFast." "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674" ) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError( "Unable to set proper padding strategy as the tokenizer does not have a padding token. " "In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via the function add_special_tokens if you want to use a padding strategy" ) first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None return self.prepare_for_model( first_ids, pair_ids=second_ids, max_length=max_length, pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, ) def batch_encode_plus( self, batch_text_or_text_pairs: Union[ List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair], List[EncodedInput], List[EncodedInputPair], ], add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_masks: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_masks: bool = False, return_offsets_mapping: bool = False, return_lengths: bool = False, **kwargs ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also: :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in encode_plus) add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): If set to ``True``, the sequences will be encoded with the special tokens relative to their model. max_length (:obj:`int`, `optional`, defaults to :obj:`None`): If set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. is_pretokenized (:obj:`bool`, defaults to :obj:`False`): Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are token type IDs? <../glossary.html#token-type-ids>`_ return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`none`): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are attention masks? <../glossary.html#attention-mask>`__ return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return overflowing token information (default False). return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return special tokens mask information (default False). return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True to return (char_start, char_end) for each token (default False). If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on Rust-based tokenizers inheriting from PreTrainedTokenizerFast. return_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): If set the resulting dictionary will include the length of each encoded inputs **kwargs: passed to the `self.tokenize()` method Return: A Dictionary of shape:: { input_ids: list[List[int]], token_type_ids: list[List[int]] if return_token_type_ids is True (default) attention_mask: list[List[int]] if return_attention_mask is True (default) overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[List[int]] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``attention_mask``: list of indices specifying which tokens should be attended to by the model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. """ def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError( "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers." "To use this feature, change your tokenizer to one deriving from " "transformers1.PreTrainedTokenizerFast." "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674" ) input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None first_ids = get_input_ids(ids) second_ids = get_input_ids(pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) if max_length is None and pad_to_max_length: def total_sequence_length(input_pairs): first_ids, second_ids = input_pairs return len(first_ids) + ( self.num_special_tokens_to_add() if second_ids is None else (len(second_ids) + self.num_special_tokens_to_add(pair=True)) ) max_length = max([total_sequence_length(ids) for ids in input_ids]) batch_outputs = {} for first_ids, second_ids in input_ids: # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by # the model. It adds special tokens, truncates sequences if overflowing while taking into account # the special tokens and manages a window stride for overflowing tokens outputs = self.prepare_for_model( first_ids, pair_ids=second_ids, max_length=max_length, pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_attention_mask=return_attention_masks, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_masks, return_lengths=return_lengths, return_tensors=None, # We will convert the whole batch to tensors at the end ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) if return_tensors is not None: self.convert_to_tensors_(batch_outputs, return_tensors) return BatchEncoding(batch_outputs) def convert_to_tensors_(self, batch_outputs: dict, return_tensors: str) -> None: # Do the tensor conversion in batch for key, value in batch_outputs.items(): if return_tensors == "tf" and is_tf_available(): try: batch_outputs[key] = tf.constant(value) except ValueError: if None in [item for sequence in value for item in sequence]: raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) else: raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) elif return_tensors == "pt" and is_torch_available(): try: batch_outputs[key] = torch.tensor(value) except ValueError: raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) except RuntimeError: if None in [item for sequence in value for item in sequence]: raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) else: raise elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( return_tensors ) ) def prepare_for_model( self, ids: List[int], pair_ids: Optional[List[int]] = None, max_length: Optional[int] = None, add_special_tokens: bool = True, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_lengths: bool = False, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative to their model. stride: window stride for overflowing tokens. Can be useful to remove edge effect when using sequential list of inputs. The overflowing token will contains a part of the previous window of tokens. truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default: set to model specifics). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). return_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): If set the resulting dictionary will include the length of each encoded inputs Return: A Dictionary of shape:: { input_ids: list[int], token_type_ids: list[int] if return_token_type_ids is True (default) overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True length: int if return_lengths is True } With the fields: - ``input_ids``: list of token ids to be fed to a model - ``token_type_ids``: list of token type ids to be fed to a model - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. - ``length``: this is the length of ``input_ids`` """ pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Truncation: Handle max sequence length total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) if max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) # Build output dictionnary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths assert max_length is None or len(encoded_inputs["input_ids"]) <= max_length if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length: logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " "for this model ({} > {}). Running this sequence through the model will result in " "indexing errors".format(len(ids), self.model_max_length) ) # Padding needs_to_be_padded = pad_to_max_length and ( max_length and len(encoded_inputs["input_ids"]) < max_length or max_length is None and len(encoded_inputs["input_ids"]) < self.model_max_length and self.model_max_length <= LARGE_INTEGER ) if pad_to_max_length and max_length is None and self.model_max_length > LARGE_INTEGER: logger.warning( "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." ) if needs_to_be_padded: difference = (max_length if max_length is not None else self.model_max_length) - len( encoded_inputs["input_ids"] ) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference ) if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ "token_type_ids" ] if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: raise ValueError("Invalid padding strategy:" + str(self.padding_side)) else: if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) if return_lengths: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) # Prepare model inputs as tensors if asked if return_tensors == "tf" and is_tf_available(): encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) elif return_tensors == "pt" and is_torch_available(): encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]]) elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( return_tensors ) ) return BatchEncoding(encoded_inputs) def prepare_for_tokenization(self, text: str, **kwargs) -> str: """ Performs any necessary transformations before tokenization """ return text def truncate_sequences( self, ids: List[int], pair_ids: Optional[List[int]] = None, num_tokens_to_remove: int = 0, truncation_strategy: str = "longest_first", stride: int = 0, ) -> Tuple[List[int], List[int], List[int]]: """ Truncates a sequence pair in place to the maximum length. Args: ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): number of tokens to remove using the truncation strategy truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences). Overflowing tokens only contains overflow from the first sequence. - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if truncation_strategy == "longest_first": overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): overflowing_tokens = [ids[-1]] + overflowing_tokens ids = ids[:-1] else: pair_ids = pair_ids[:-1] window_len = min(len(ids), stride) if window_len > 0: overflowing_tokens = ids[-window_len:] + overflowing_tokens elif truncation_strategy == "only_first": assert len(ids) > num_tokens_to_remove window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] elif truncation_strategy == "only_second": assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove window_len = min(len(pair_ids), stride + num_tokens_to_remove) overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif truncation_strategy == "do_not_truncate": raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") else: raise ValueError( "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" ) return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]: if token_ids_1 is None: return len(token_ids_0) * [0] return [0] * len(token_ids_0) + [1] * len(token_ids_1) def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. This implementation does not add special tokens. """ if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def get_special_tokens_mask( self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids for sequence pairs already_has_special_tokens: (default False) Set to True if the token list is already formated with special tokens for the model Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens( self, ids: Union[int, List[int]], skip_special_tokens: bool = False ) -> Union[int, List[int]]: """ Converts a single index or a sequence of indices (integers) in a token " (resp.) a sequence of tokens (str), using the vocabulary and added tokens. Args: skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False """ if isinstance(ids, int): if ids in self.added_tokens_decoder: return self.added_tokens_decoder[ids] else: return self._convert_id_to_token(ids) tokens = [] for index in ids: index = int(index) if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: tokens.append(self.added_tokens_decoder[index]) else: tokens.append(self._convert_id_to_token(index)) return tokens def _convert_id_to_token(self, index: int) -> str: raise NotImplementedError def convert_tokens_to_string(self, tokens: List[str]) -> str: """ Converts a sequence of tokens (string) in a single string. The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) but we often want to remove sub-word tokenization artifacts at the same time. """ return " ".join(self.convert_ids_to_tokens(tokens)) def decode( self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True ) -> str: """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary with options to remove special tokens and clean up tokenization spaces. Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. Args: token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. skip_special_tokens: if set to True, will replace special tokens. clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. """ filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separatly for added tokens and byte-level tokens # cf. https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] for token in filtered_tokens: if skip_special_tokens and token in self.all_special_ids: continue if token in self.added_tokens_encoder: if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) text = " ".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]: return [self.decode(seq, **kwargs) for seq in sequences] @staticmethod def clean_up_tokenization(out_string: str) -> str: """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. """ out_string = ( out_string.replace(" .", ".") .replace(" ?", "?") .replace(" !", "!") .replace(" ,", ",") .replace(" ' ", "'") .replace(" n't", "n't") .replace(" 'm", "'m") .replace(" 's", "'s") .replace(" 've", "'ve") .replace(" 're", "'re") ) return out_string class PreTrainedTokenizerFast(PreTrainedTokenizer): """ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library). Inherit from PreTrainedTokenizer. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). Class attributes (overridden by derived classes): - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. Args: - ``tokenizer`` (`BaseTokenizerFast`): A Fast tokenizer from the HuggingFace tokenizer library (in low level Rust language) - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model. When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`). no associated max_length can be found in ``max_model_input_sizes``. - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. Should be selected between ['right', 'left'] - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the model ("token_type_ids", "attention_mask"...). - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ def __init__(self, tokenizer: BaseTokenizerFast, **kwargs): if not isinstance(tokenizer, BaseTokenizerFast): raise ValueError( "Tokenizer should be an instance of a Tokenizer " "provided by HuggingFace tokenizers library." ) self._tokenizer: BaseTokenizerFast = tokenizer # Initialize all the rest of the kwargs super().__init__(**kwargs) @property def backend_tokenizer(self) -> BaseTokenizerFast: return self._tokenizer @property def decoder(self) -> DecoderFast: return self._tokenizer._tokenizer.decoder @property def is_fast(self) -> bool: return True @property def vocab_size(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=False) def __len__(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=True) def _maybe_update_backend(self, value): """ Update the backend fast tokenizer. Override method from base class SpecialTokensMixin """ self._tokenizer.add_special_tokens(value) def _convert_encoding( self, encoding: EncodingFast, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, ) -> Dict[str, Any]: """ Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are lists (overflows) of lists (tokens). If return_tensors is not None, these lists of lists are converted to 2-D tensors for input_ids, token_type_ids and attention_mask. Output shape: (overflows, sequence length) """ if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names if return_overflowing_tokens and encoding.overflowing is not None: encodings = [encoding] + encoding.overflowing else: encodings = [encoding] encoding_dict = defaultdict(list) for e in encodings: encoding_dict["input_ids"].append(e.ids) if return_token_type_ids: encoding_dict["token_type_ids"].append(e.type_ids) if return_attention_mask: encoding_dict["attention_mask"].append(e.attention_mask) if return_special_tokens_mask: encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) if return_offsets_mapping: encoding_dict["offset_mapping"].append(e.offsets) if return_tensors is not None: for key, value in encoding_dict.items(): if return_tensors == "tf" and is_tf_available(): encoding_dict[key] = tf.constant(value) elif return_tensors == "pt" and is_torch_available(): encoding_dict[key] = torch.tensor(value) elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, " "PyTorch or TensorFlow is not available.".format(return_tensors) ) return encoding_dict def _convert_token_to_id_with_added_voc(self, token: int) -> str: index = self._tokenizer.token_to_id(token) if index is None: return self.unk_token_id return index def _convert_id_to_token(self, index: int) -> Optional[str]: return self._tokenizer.id_to_token(int(index)) def get_vocab(self): return self._tokenizer.get_vocab(True) def convert_tokens_to_string(self, tokens: List[int], skip_special_tokens: bool = False) -> str: return self._tokenizer.decode(tokens, skip_special_tokens) def add_tokens(self, new_tokens: List[Union[str, AddedTokenFast]]) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens: string or list of string or AddedTokenFast. Each string is a token to add. Tokens are only added if they are not already in the vocabulary. AddedTokenFast wrap a string token to let you personnalize it's behavior (Whether this token should only match against single word, whether this token should strip all potential whitespaces on the left side, Whether this token should strip all potential whitespaces on the right side...). See details for AddedToken in HuggingFace tokenizers library. Returns: Number of tokens added to the vocabulary. Examples:: # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. """ if isinstance(new_tokens, str): new_tokens = [new_tokens] return self._tokenizer.add_tokens(new_tokens) def add_special_tokens(self, special_tokens_dict: dict) -> int: # Map special tokens to class attributes (self.pad_token...) super().add_special_tokens(special_tokens_dict) # If the backend tokenizer the only specificities of special tokens are that # - they will never be processed by the model, and # - they will be removed while decoding. # But they are not mapped to special attributes in the backend so we can just # send a list. tokens = [] for token in special_tokens_dict.values(): if isinstance(token, list): tokens += token else: tokens += [token] num_added_tokens = self._tokenizer.add_special_tokens(tokens) return num_added_tokens def num_special_tokens_to_add(self, pair: bool = False) -> int: return self._tokenizer.num_special_tokens_to_add(pair) def tokenize( self, text: TextInput, pair: Optional[TextInput] = None, add_special_tokens: bool = False ) -> List[str]: return self._tokenizer.encode(text, pair, add_special_tokens).tokens def batch_encode_plus( self, batch_text_or_text_pairs: Union[ List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair] ], add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_lengths: bool = False, **kwargs ) -> BatchEncoding: if not isinstance(batch_text_or_text_pairs, list): raise ValueError( "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs)) ) # Needed if we have to return a tensor pad_to_max_length = pad_to_max_length or (return_tensors is not None and len(batch_text_or_text_pairs) > 1) # Throw an error if we can pad because there is no padding token if pad_to_max_length and self.pad_token_id is None: raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token") # Set the truncation and padding strategy and restore the initial configuration with truncate_and_pad( tokenizer=self._tokenizer, max_length=max_length, stride=stride, strategy=truncation_strategy, pad_to_max_length=pad_to_max_length, padding_side=self.padding_side, pad_token_id=self.pad_token_id, pad_token_type_id=self.pad_token_type_id, pad_token=self._pad_token, ): # Check for the pretokenized path if is_pretokenized: encodings = [] # Iterate over each sample (we don't know yet if they are pairs or simple input for i, sample in enumerate(batch_text_or_text_pairs): if not isinstance(sample, (list, tuple)): raise TypeError( "batch_encode_plus(..., is_pretokenized=True) requires batch_text_or_text_pairs " "to be either List[List[str]] or List[Tuple[List[str], List[str]]] but sample at " "index {} is of type {}".format(i, type(sample)) ) # Test if we have a pair of sentences by checking the depth of nesting is_pair = bool(len(sample) > 0 and isinstance(sample[0], (list, tuple))) # Take care of the first sequence - we multi-thread over the words encodings_text = EncodingFast.merge( self._tokenizer.encode_batch(sample[0] if is_pair else sample, add_special_tokens=False), growing_offsets=True, ) # Take care of the second sequence if we have a pair if is_pair: encodings_pair = EncodingFast.merge( self._tokenizer.encode_batch([("", s) for s in sample[1]], add_special_tokens=False), growing_offsets=True, ) else: encodings_pair = None # Post-process - truncate/pad and add special tokens encoding = self._tokenizer.post_process(encodings_text, encodings_pair, add_special_tokens) encodings.append(encoding) # Classical path with strings input else: # Avoid thread overhead if only one example. if len(batch_text_or_text_pairs) == 1: if isinstance(batch_text_or_text_pairs[0], (tuple, list)): encodings = self._tokenizer.encode( *batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens ) else: encodings = self._tokenizer.encode( batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens ) encodings = [encodings] else: encodings = self._tokenizer.encode_batch( batch_text_or_text_pairs, add_special_tokens=add_special_tokens ) # Convert encoding to dict # `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]] # with nested dimensions corresponding to batch, overflows, sequence length tokens = [ self._convert_encoding( encoding=encoding, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, ) for encoding in encodings ] # Sanitize the output to have dict[list] from list[dict] sanitized = {} for key in tokens[0].keys(): # To List[List[List[int]]] of shape (batch, overflows, sequence length) stack = [e for item in tokens for e in item[key]] if return_tensors == "tf": stack = tf.stack(stack, axis=0) elif return_tensors == "pt": stack = torch.stack(stack, dim=0) # elif not return_tensors and len(stack) == 1: # stack = stack[0] sanitized[key] = stack # If returning overflowing tokens, we need to return a mapping # from the batch idx to the original sample if return_overflowing_tokens: overflow_to_sample_mapping = flatten([[i] * len(enc["input_ids"]) for i, enc in enumerate(tokens)]) sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping return BatchEncoding(sanitized, encodings) def encode_plus( self, text: Union[TextInput, PreTokenizedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, pad_to_max_length: bool = False, stride: int = 0, truncation_strategy: str = "longest_first", is_pretokenized: bool = False, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs ) -> BatchEncoding: # Check for pretokenized path (ie [token1, token2, ..., tokenN] -> [id1, id2, ..., idN] if is_pretokenized: if isinstance(text, list) and len(text) > 0: # Encode through encode_batch with sequence of only one word which will be merged after hand encoding = self._tokenizer.encode_batch(text, add_special_tokens=False) encoding = EncodingFast.merge(encoding, growing_offsets=True) # Let's do the same for pairs if provided if isinstance(text_pair, list): # We prepend empty string before each word so that encoding is aware content is a pair encoding_pair = self._tokenizer.encode_batch( [("", p) for p in text_pair], add_special_tokens=False ) encoding_pair = EncodingFast.merge(encoding_pair, growing_offsets=True) elif text_pair is None: encoding_pair = None else: raise TypeError( "encode_plus(..., is_pretokenized=True) requires text and text_pair to be List[str] " "but got (text={}, text_pair={})".format(type(text), type(text_pair)) ) # Post process and if asked to do so, insert special tokens where needed encoding = self._tokenizer.post_process(encoding, encoding_pair, add_special_tokens) batched_output = BatchEncoding( self._convert_encoding( encoding, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, ), encoding, ) else: raise TypeError( "encode_plus(..., is_pretokenized=True) requires text to be List[str] " "but got (text={}, text_pair={})".format(type(text), type(text_pair)) ) else: batched_input = [(text, text_pair)] if text_pair else [text] batched_output = self.batch_encode_plus( batched_input, add_special_tokens=add_special_tokens, max_length=max_length, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, pad_to_max_length=pad_to_max_length, **kwargs, ) # Return tensor is None, then we can remove the leading batch axis if not return_tensors: batched_output = BatchEncoding( { key: value[0] if len(value) > 0 and isinstance(value[0], list) else value for key, value in batched_output.items() }, batched_output.encodings, ) return batched_output def decode( self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True ) -> str: text = self._tokenizer.decode(token_ids, skip_special_tokens) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text def save_vocabulary(self, save_directory: str) -> Tuple[str]: if os.path.isdir(save_directory): files = self._tokenizer.save(save_directory) else: folder, file = os.path.split(os.path.abspath(save_directory)) files = self._tokenizer.save(folder, name=file) return tuple(files) def trim_batch( input_ids, pad_token_id, attention_mask=None, ): """Remove columns that are populated exclusively by pad_token_id""" keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) if attention_mask is None: return input_ids[:, keep_column_mask] else: return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_xlm.py ================================================ # coding=utf-8 # Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for XLM.""" import json import logging import os import re import sys import unicodedata from typing import List, Optional import sacremoses as sm from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", }, "merges_file": { "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlm-mlm-en-2048": 512, "xlm-mlm-ende-1024": 512, "xlm-mlm-enfr-1024": 512, "xlm-mlm-enro-1024": 512, "xlm-mlm-tlm-xnli15-1024": 512, "xlm-mlm-xnli15-1024": 512, "xlm-clm-enfr-1024": 512, "xlm-clm-ende-1024": 512, "xlm-mlm-17-1280": 512, "xlm-mlm-100-1280": 512, } PRETRAINED_INIT_CONFIGURATION = { "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True}, "xlm-mlm-ende-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "de", "1": "en"}, "lang2id": {"de": 0, "en": 1}, }, "xlm-mlm-enfr-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "fr"}, "lang2id": {"en": 0, "fr": 1}, }, "xlm-mlm-enro-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "ro"}, "lang2id": {"en": 0, "ro": 1}, }, "xlm-mlm-tlm-xnli15-1024": { "do_lowercase_and_remove_accent": True, "id2lang": { "0": "ar", "1": "bg", "2": "de", "3": "el", "4": "en", "5": "es", "6": "fr", "7": "hi", "8": "ru", "9": "sw", "10": "th", "11": "tr", "12": "ur", "13": "vi", "14": "zh", }, "lang2id": { "ar": 0, "bg": 1, "de": 2, "el": 3, "en": 4, "es": 5, "fr": 6, "hi": 7, "ru": 8, "sw": 9, "th": 10, "tr": 11, "ur": 12, "vi": 13, "zh": 14, }, }, "xlm-mlm-xnli15-1024": { "do_lowercase_and_remove_accent": True, "id2lang": { "0": "ar", "1": "bg", "2": "de", "3": "el", "4": "en", "5": "es", "6": "fr", "7": "hi", "8": "ru", "9": "sw", "10": "th", "11": "tr", "12": "ur", "13": "vi", "14": "zh", }, "lang2id": { "ar": 0, "bg": 1, "de": 2, "el": 3, "en": 4, "es": 5, "fr": 6, "hi": 7, "ru": 8, "sw": 9, "th": 10, "tr": 11, "ur": 12, "vi": 13, "zh": 14, }, }, "xlm-clm-enfr-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "en", "1": "fr"}, "lang2id": {"en": 0, "fr": 1}, }, "xlm-clm-ende-1024": { "do_lowercase_and_remove_accent": True, "id2lang": {"0": "de", "1": "en"}, "lang2id": {"de": 0, "en": 1}, }, "xlm-mlm-17-1280": { "do_lowercase_and_remove_accent": False, "id2lang": { "0": "ar", "1": "de", "2": "en", "3": "es", "4": "fr", "5": "hi", "6": "it", "7": "ja", "8": "ko", "9": "nl", "10": "pl", "11": "pt", "12": "ru", "13": "sv", "14": "tr", "15": "vi", "16": "zh", }, "lang2id": { "ar": 0, "de": 1, "en": 2, "es": 3, "fr": 4, "hi": 5, "it": 6, "ja": 7, "ko": 8, "nl": 9, "pl": 10, "pt": 11, "ru": 12, "sv": 13, "tr": 14, "vi": 15, "zh": 16, }, }, "xlm-mlm-100-1280": { "do_lowercase_and_remove_accent": False, "id2lang": { "0": "af", "1": "als", "2": "am", "3": "an", "4": "ang", "5": "ar", "6": "arz", "7": "ast", "8": "az", "9": "bar", "10": "be", "11": "bg", "12": "bn", "13": "br", "14": "bs", "15": "ca", "16": "ceb", "17": "ckb", "18": "cs", "19": "cy", "20": "da", "21": "de", "22": "el", "23": "en", "24": "eo", "25": "es", "26": "et", "27": "eu", "28": "fa", "29": "fi", "30": "fr", "31": "fy", "32": "ga", "33": "gan", "34": "gl", "35": "gu", "36": "he", "37": "hi", "38": "hr", "39": "hu", "40": "hy", "41": "ia", "42": "id", "43": "is", "44": "it", "45": "ja", "46": "jv", "47": "ka", "48": "kk", "49": "kn", "50": "ko", "51": "ku", "52": "la", "53": "lb", "54": "lt", "55": "lv", "56": "mk", "57": "ml", "58": "mn", "59": "mr", "60": "ms", "61": "my", "62": "nds", "63": "ne", "64": "nl", "65": "nn", "66": "no", "67": "oc", "68": "pl", "69": "pt", "70": "ro", "71": "ru", "72": "scn", "73": "sco", "74": "sh", "75": "si", "76": "simple", "77": "sk", "78": "sl", "79": "sq", "80": "sr", "81": "sv", "82": "sw", "83": "ta", "84": "te", "85": "th", "86": "tl", "87": "tr", "88": "tt", "89": "uk", "90": "ur", "91": "uz", "92": "vi", "93": "war", "94": "wuu", "95": "yi", "96": "zh", "97": "zh_classical", "98": "zh_min_nan", "99": "zh_yue", }, "lang2id": { "af": 0, "als": 1, "am": 2, "an": 3, "ang": 4, "ar": 5, "arz": 6, "ast": 7, "az": 8, "bar": 9, "be": 10, "bg": 11, "bn": 12, "br": 13, "bs": 14, "ca": 15, "ceb": 16, "ckb": 17, "cs": 18, "cy": 19, "da": 20, "de": 21, "el": 22, "en": 23, "eo": 24, "es": 25, "et": 26, "eu": 27, "fa": 28, "fi": 29, "fr": 30, "fy": 31, "ga": 32, "gan": 33, "gl": 34, "gu": 35, "he": 36, "hi": 37, "hr": 38, "hu": 39, "hy": 40, "ia": 41, "id": 42, "is": 43, "it": 44, "ja": 45, "jv": 46, "ka": 47, "kk": 48, "kn": 49, "ko": 50, "ku": 51, "la": 52, "lb": 53, "lt": 54, "lv": 55, "mk": 56, "ml": 57, "mn": 58, "mr": 59, "ms": 60, "my": 61, "nds": 62, "ne": 63, "nl": 64, "nn": 65, "no": 66, "oc": 67, "pl": 68, "pt": 69, "ro": 70, "ru": 71, "scn": 72, "sco": 73, "sh": 74, "si": 75, "simple": 76, "sk": 77, "sl": 78, "sq": 79, "sr": 80, "sv": 81, "sw": 82, "ta": 83, "te": 84, "th": 85, "tl": 86, "tr": 87, "tt": 88, "uk": 89, "ur": 90, "uz": 91, "vi": 92, "war": 93, "wuu": 94, "yi": 95, "zh": 96, "zh_classical": 97, "zh_min_nan": 98, "zh_yue": 99, }, }, } def get_pairs(word): """ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length strings) """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def lowercase_and_remove_accent(text): """ Lowercase and strips accents from a piece of text based on https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py """ text = " ".join(text) text = text.lower() text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output).lower().split(" ") def replace_unicode_punct(text): """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl """ text = text.replace(",", ",") text = re.sub(r"。\s*", ". ", text) text = text.replace("、", ",") text = text.replace("”", '"') text = text.replace("“", '"') text = text.replace("∶", ":") text = text.replace(":", ":") text = text.replace("?", "?") text = text.replace("《", '"') text = text.replace("》", '"') text = text.replace(")", ")") text = text.replace("!", "!") text = text.replace("(", "(") text = text.replace(";", ";") text = text.replace("1", "1") text = text.replace("」", '"') text = text.replace("「", '"') text = text.replace("0", "0") text = text.replace("3", "3") text = text.replace("2", "2") text = text.replace("5", "5") text = text.replace("6", "6") text = text.replace("9", "9") text = text.replace("7", "7") text = text.replace("8", "8") text = text.replace("4", "4") text = re.sub(r".\s*", ". ", text) text = text.replace("~", "~") text = text.replace("’", "'") text = text.replace("…", "...") text = text.replace("━", "-") text = text.replace("〈", "<") text = text.replace("〉", ">") text = text.replace("【", "[") text = text.replace("】", "]") text = text.replace("%", "%") return text def remove_non_printing_char(text): """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl """ output = [] for char in text: cat = unicodedata.category(char) if cat.startswith("C"): continue output.append(char) return "".join(output) def romanian_preprocessing(text): """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`""" # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma text = text.replace("\u0102", "A").replace("\u0103", "a") text = text.replace("\u00C2", "A").replace("\u00E2", "a") text = text.replace("\u00CE", "I").replace("\u00EE", "i") return text class XLMTokenizer(PreTrainedTokenizer): """ BPE tokenizer for XLM - Moses preprocessing & tokenization for most supported languages - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) - (optionally) lower case & normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ (ex: "__classify__") to a vocabulary - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): Vocabulary file. merges_file (:obj:`string`): Merges file. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["","","","","","","","","",""]`): List of additional special tokens. lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`): Dictionary mapping languages string identifiers to their IDs. id2lang (:obj:`Dict[int, str`, `optional`, defaults to :obj:`None`): Dictionary mapping language IDs to their string identifiers. do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase and remove accents when tokenizing. """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__( self, vocab_file, merges_file, unk_token="", bos_token="", sep_token="", pad_token="", cls_token="", mask_token="", additional_special_tokens=[ "", "", "", "", "", "", "", "", "", "", ], lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, **kwargs ): super().__init__( unk_token=unk_token, bos_token=bos_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) # cache of sm.MosesPunctNormalizer instance self.cache_moses_punct_normalizer = dict() # cache of sm.MosesTokenizer instance self.cache_moses_tokenizer = dict() self.lang_with_custom_tokenizer = set(["zh", "th", "ja"]) # True for current supported model (v1.2.0), False for XLM-17 & 100 self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent self.lang2id = lang2id self.id2lang = id2lang if lang2id is not None and id2lang is not None: assert len(lang2id) == len(id2lang) self.ja_word_tokenizer = None self.zh_word_tokenizer = None with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[:-1] merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} def moses_punct_norm(self, text, lang): if lang not in self.cache_moses_punct_normalizer: punct_normalizer = sm.MosesPunctNormalizer(lang=lang) self.cache_moses_punct_normalizer[lang] = punct_normalizer else: punct_normalizer = self.cache_moses_punct_normalizer[lang] return punct_normalizer.normalize(text) def moses_tokenize(self, text, lang): if lang not in self.cache_moses_tokenizer: moses_tokenizer = sm.MosesTokenizer(lang=lang) self.cache_moses_tokenizer[lang] = moses_tokenizer else: moses_tokenizer = self.cache_moses_tokenizer[lang] return moses_tokenizer.tokenize(text, return_str=False, escape=False) def moses_pipeline(self, text, lang): text = replace_unicode_punct(text) text = self.moses_punct_norm(text, lang) text = remove_non_printing_char(text) return text def ja_tokenize(self, text): if self.ja_word_tokenizer is None: try: import Mykytea self.ja_word_tokenizer = Mykytea.Mykytea( "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~") ) except (AttributeError, ImportError): logger.error( "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps" ) logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") logger.error("4. make && make install") logger.error("5. pip install kytea") raise return list(self.ja_word_tokenizer.getWS(text)) @property def vocab_size(self): return len(self.encoder) def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token + "" while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break else: new_word.extend(word[i:j]) i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = " ".join(word) if word == "\n ": word = "\n" self.cache[token] = word return word def _tokenize(self, text, lang="en", bypass_tokenizer=False): """ Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses. Details of tokenization: - [sacremoses](https://github.com/alvations/sacremoses): port of Moses - Install with `pip install sacremoses` - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer - Install with `pip install pythainlp` - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea) - Install with the following steps: ``` git clone git@github.com:neubig/kytea.git && cd kytea autoreconf -i ./configure --prefix=$HOME/local make && make install pip install kytea ``` - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*) - Install with `pip install jieba` (*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally, and set `bypass_tokenizer=True` to bypass the tokenizer. Args: - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. Returns: List of tokens. """ if lang and self.lang2id and lang not in self.lang2id: logger.error( "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." ) if bypass_tokenizer: text = text.split() elif lang not in self.lang_with_custom_tokenizer: text = self.moses_pipeline(text, lang=lang) # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step if lang == "ro": text = romanian_preprocessing(text) text = self.moses_tokenize(text, lang=lang) elif lang == "th": text = self.moses_pipeline(text, lang=lang) try: if "pythainlp" not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize else: th_word_tokenize = sys.modules["pythainlp"].word_tokenize except (AttributeError, ImportError): logger.error( "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps" ) logger.error("1. pip install pythainlp") raise text = th_word_tokenize(text) elif lang == "zh": try: if "jieba" not in sys.modules: import jieba else: jieba = sys.modules["jieba"] except (AttributeError, ImportError): logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") raise text = " ".join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() elif lang == "ja": text = self.moses_pipeline(text, lang=lang) text = self.ja_tokenize(text) else: raise ValueError("It should not reach here") if self.do_lowercase_and_remove_accent and not bypass_tokenizer: text = lowercase_and_remove_accent(text) split_tokens = [] for token in text: if token: split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ out_string = "".join(tokens).replace("", " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A XLM sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ bos = [self.bos_token_id] sep = [self.sep_token_id] if token_ids_1 is None: return bos + token_ids_0 + sep return bos + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0,)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | if token_ids_1 is None, only returns the first portion of the mask (0s). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): """ Save the vocabulary and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: logger.warning( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!".format(merge_file) ) index = token_index writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_xlm_roberta.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for XLM-RoBERTa model.""" import logging import os from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlm-roberta-base": 512, "xlm-roberta-large": 512, "xlm-roberta-large-finetuned-conll02-dutch": 512, "xlm-roberta-large-finetuned-conll02-spanish": 512, "xlm-roberta-large-finetuned-conll03-english": 512, "xlm-roberta-large-finetuned-conll03-german": 512, } class XLMRobertaTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`str`): Path to the vocabulary file. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] def __init__( self, vocab_file, bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token="", mask_token="", **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs, ) try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # Original fairseq vocab and spm vocab must be "aligned": # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' # Mimic fairseq token-to-id alignment for the first 4 token self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A XLM-R sequence has the following format: - single sequence: `` X `` - pair of sequences: `` A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-R does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of zeros. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): return len(self.sp_model) + self.fairseq_offset + 1 # Add the token def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] spm_id = self.sp_model.PieceToId(token) # Need to return unknown token if the SP model returned 0 return spm_id + self.fairseq_offset if spm_id else self.unk_token_id def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/tokenization_xlnet.py ================================================ # coding=utf-8 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for XLNet model.""" import logging import os import unicodedata from shutil import copyfile from typing import List, Optional from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlnet-base-cased": None, "xlnet-large-cased": None, } SPIECE_UNDERLINE = "▁" # Segments (not really needed) SEG_ID_A = 0 SEG_ID_B = 1 SEG_ID_CLS = 2 SEG_ID_SEP = 3 SEG_ID_PAD = 4 class XLNetTokenizer(PreTrainedTokenizer): """ Constructs an XLNet tokenizer. Based on `SentencePiece `__ This tokenizer inherits from :class:`~transformers1.PreTrainedTokenizer` which contains most of the methods. Users should refer to the superclass for more information regarding methods. Args: vocab_file (:obj:`string`): `SentencePiece `__ file (generally has a .spm extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase the input when tokenizing. remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to keep accents when tokenizing. bos_token (:obj:`string`, `optional`, defaults to ""): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. .. note:: When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`string`, `optional`, defaults to ""): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. unk_token (:obj:`string`, `optional`, defaults to ""): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (:obj:`string`, `optional`, defaults to ""): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (:obj:`string`, `optional`, defaults to ""): The token used for padding, for example when batching sequences of different lengths. cls_token (:obj:`string`, `optional`, defaults to ""): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`string`, `optional`, defaults to ""): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES padding_side = "left" def __init__( self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, bos_token="", eos_token="", unk_token="", sep_token="", pad_token="", cls_token="", mask_token="", additional_special_tokens=["", ""], **kwargs ): super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs, ) self._pad_token_type_id = 3 try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): return len(self.sp_model) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d try: import sentencepiece as spm except ImportError: logger.warning( "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece" ) raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() return outputs def _tokenize(self, text, sample=False): """ Tokenize a string. """ text = self.preprocess_text(text) if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] else: cur_pieces[0] = cur_pieces[0][1:] cur_pieces.append(piece[-1]) new_pieces.extend(cur_pieces) else: new_pieces.append(piece) return new_pieces def _convert_token_to_id(self, token): """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An XLNet sequence has the following format: - single sequence: ``X `` - pair of sequences: ``A B `` Args: token_ids_0 (:obj:`List[int]`): List of IDs to which the special tokens will be added token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): Set to True if the token list is already formatted with special tokens for the model Returns: :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] return ([0] * len(token_ids_0)) + [1, 1] def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 | first sequence | second sequence | CLS segment ID if token_ids_1 is None, only returns the first portion of the mask (0's). Args: token_ids_0 (:obj:`List[int]`): List of ids. token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): Optional second list of IDs for sequence pairs. Returns: :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given sequence(s). """ sep = [self.sep_token_id] cls_segment_id = [2] if token_ids_1 is None: return len(token_ids_0 + sep) * [0] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. Args: save_directory (:obj:`str`): The directory in which to save the vocabulary. Returns: :obj:`Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/trainer.py ================================================ import json import logging import math import os import random import re import shutil from contextlib import contextmanager from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple import time import numpy as np import torch from packaging import version from torch import nn from torch.utils.data.dataloader import DataLoader from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler from tqdm.auto import tqdm, trange from .data.data_collator import DataCollator, DefaultDataCollator from transformers.modeling_utils import PreTrainedModel from .optimization import AdamW from transformers import get_polynomial_decay_schedule_with_warmup#需要新版才有 from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput from .training_args import TrainingArguments, is_tpu_available try: from apex import amp _has_apex = True except ImportError: _has_apex = False def is_apex_available(): return _has_apex if is_tpu_available(): import torch_xla.core.xla_model as xm import torch_xla.debug.metrics as met import torch_xla.distributed.parallel_loader as pl try: from torch.utils.tensorboard import SummaryWriter _has_tensorboard = True except ImportError: try: from tensorboardX import SummaryWriter _has_tensorboard = True except ImportError: _has_tensorboard = False def is_tensorboard_available(): return _has_tensorboard try: import wandb wandb.ensure_configured() if wandb.api.api_key is None: _has_wandb = False wandb.termwarn("W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.") else: _has_wandb = False if os.getenv("WANDB_DISABLED") else True except ImportError: _has_wandb = False def is_wandb_available(): return _has_wandb logger = logging.getLogger(__name__) def set_seed(seed: int): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # ^^ safe to call this function even if cuda is not available @contextmanager def torch_distributed_zero_first(local_rank: int): """ Decorator to make all processes in distributed training wait for each local_master to do something. """ if local_rank not in [-1, 0]: torch.distributed.barrier() yield if local_rank == 0: torch.distributed.barrier() class SequentialDistributedSampler(Sampler): """ Distributed Sampler that subsamples indicies sequentially, making it easier to collate all results at the end. Even though we only use this sampler for eval and predict (no training), which means that the model params won't have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather` or `reduce` resulting tensors at the end of the loop. """ def __init__(self, dataset, num_replicas=None, rank=None): if num_replicas is None: if not torch.distributed.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = torch.distributed.get_world_size() if rank is None: if not torch.distributed.is_available(): raise RuntimeError("Requires distributed package to be available") rank = torch.distributed.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) self.total_size = self.num_samples * self.num_replicas def __iter__(self): indices = list(range(len(self.dataset))) # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] assert len(indices) == self.total_size # subsample indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def get_tpu_sampler(dataset: Dataset): if xm.xrt_world_size() <= 1: return RandomSampler(dataset) return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) class Trainer: """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers. """ model: PreTrainedModel args: TrainingArguments train_dataset: Optional[Dataset] eval_dataset: Optional[Dataset] compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool tb_writer: Optional["SummaryWriter"] = None optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None global_step: Optional[int] = None epoch: Optional[float] = None def __init__( self, model: PreTrainedModel, args: TrainingArguments, train_dataLoader: Optional[DataLoader] = None, eval_dataLoader: Optional[DataLoader] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, tb_writer: Optional["SummaryWriter"] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None, ): """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers. Args: prediction_loss_only: (Optional) in evaluation and prediction, only return the loss """ self.model = model.to(args.device) self.args = args self.train_dataLoader = train_dataLoader self.eval_dataLoader = eval_dataLoader self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only self.optimizers = optimizers if tb_writer is not None: self.tb_writer = tb_writer elif is_tensorboard_available() and self.is_world_master(): self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir) if not is_tensorboard_available(): logger.warning( "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it." ) if is_wandb_available(): self._setup_wandb() else: logger.info( "You are instantiating a Trainer but W&B is not installed. To use wandb logging, " "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface." ) set_seed(self.args.seed) # Create output directory if needed if self.is_world_master(): os.makedirs(self.args.output_dir, exist_ok=True) if is_tpu_available(): # Set an xla_device flag on the model's config. # We'll find a more elegant and not need to do this in the future. self.model.config.xla_device = True def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: # We use the same batch_size as for eval. if is_tpu_available(): sampler = SequentialDistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal() ) elif self.args.local_rank != -1: sampler = SequentialDistributedSampler(test_dataset) else: sampler = SequentialSampler(test_dataset) data_loader = DataLoader( test_dataset, sampler=sampler, batch_size=self.args.eval_batch_size, ) return data_loader def get_optimizers( self, num_training_steps: int ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]: """ Setup the optimizer and the learning rate scheduler. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init, or override this method in a subclass. """ if self.optimizers is not None: return self.optimizers # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.args.weight_decay, }, { "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps,lr_end=self.args.lr_end ) return optimizer, scheduler def _setup_wandb(self): """ Setup the optional Weights & Biases (`wandb`) integration. One can override this method to customize the setup if needed. Find more information at https://docs.wandb.com/huggingface You can also override the following environment variables: Environment: WANDB_WATCH: (Optional, ["gradients", "all", "false"]) "gradients" by default, set to "false" to disable gradient logging or "all" to log gradients and parameters WANDB_PROJECT: (Optional): str - "huggingface" by default, set this to a custom string to store results in a different project WANDB_DISABLED: (Optional): boolean - defaults to false, set to "true" to disable wandb entirely """ logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"') wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args)) # keep track of model topology and gradients if os.getenv("WANDB_WATCH") != "false": wandb.watch( self.model, log=os.getenv("WANDB_WATCH", "gradients"), log_freq=max(100, self.args.logging_steps) ) def num_examples(self, dataloader: DataLoader) -> int: """ Helper to get num of examples from a DataLoader, by accessing its Dataset. """ return len(dataloader.dataset) def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path: (Optional) Local path to model if model to train has been instantiated from a local path If present, we will try reloading the optimizer/scheduler states from there. """ train_dataloader = self.train_dataLoader if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 tqdmLoss=0#进度条的loss用滑动平均显示 beta_exp=1 model.zero_grad() train_iterator = trange( epochs_trained, int(num_train_epochs), desc="Epoch", disable=True ) for epoch in train_iterator: last=time.time() if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) if is_tpu_available(): parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master()) else: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True,ncols=70)#固定下长度,不然要换行 for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue now_loss=self._training_step(model, inputs, optimizer) tr_loss += now_loss #丰富进度条 tqdmLoss=tqdmLoss*0.99+(1-0.99)*now_loss#滑动平均下 beta_exp*=0.99#校正 epoch_iterator.set_description_str(f"epoch:{epoch+1}") epoch_iterator.set_postfix_str(f"loss:{round(tqdmLoss/(1-beta_exp),4)}") if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator) ): if self.args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) logging_loss = tr_loss print()#log前要换行,不然和进度条挤在一起 self._log(logs) print() if self.args.evaluate_during_training: self.evaluate() if self.args.save_steps > 0 and self.global_step % self.args.save_steps==0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. if hasattr(model, "module"): assert model.module is self.model else: assert model is self.model # Save model checkpoint output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}-epoch-{int(self.epoch)}") self.save_model(output_dir) if self.is_world_master(): self._rotate_checkpoints() if is_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) elif self.is_world_master(): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: epoch_iterator.close() break print(f"预训练第{epoch}轮耗时:",time.time()-last) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: train_iterator.close() break if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return TrainOutput(self.global_step, tr_loss / self.global_step) def _log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if self.epoch is not None: logs["epoch"] = self.epoch if self.tb_writer: for k, v in logs.items(): self.tb_writer.add_scalar(k, v, self.global_step) if is_wandb_available(): wandb.log(logs, step=self.global_step) output = json.dumps({**logs, **{"step": self.global_step}}) if iterator is not None: iterator.write(output) else: print(output) def _training_step( self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer ) -> float: model.train() for k, v in inputs.items(): inputs[k] = v.to(self.args.device) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers1 (see doc) if self.args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps if self.args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() return loss.item() def is_local_master(self) -> bool: if is_tpu_available(): return xm.is_master_ordinal(local=True) else: return self.args.local_rank in [-1, 0] def is_world_master(self) -> bool: """ This will be True only in one process, even in distributed mode, even when training on multiple machines. """ if is_tpu_available(): return xm.is_master_ordinal(local=False) else: return self.args.local_rank == -1 or torch.distributed.get_rank() == 0 def save_model(self, output_dir: Optional[str] = None): """ Saving best-practices: if you use default names for the model, you can reload it using from_pretrained(). Will only save from the world_master process (unless in TPUs). """ if is_tpu_available(): self._save_tpu(output_dir) elif self.is_world_master(): self._save(output_dir) def _save_tpu(self, output_dir: Optional[str] = None): output_dir = output_dir if output_dir is not None else self.args.output_dir logger.info("Saving model checkpoint to %s", output_dir) if xm.is_master_ordinal(): os.makedirs(output_dir, exist_ok=True) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") xm.rendezvous("saving_checkpoint") self.model.save_pretrained(output_dir) def _save(self, output_dir: Optional[str] = None): output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) logger.info("Saving model checkpoint to %s", output_dir) # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") self.model.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, "training_args.bin")) def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: ordering_and_checkpoint_path = [] glob_checkpoints = [str(x) for x in Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*")] for path in glob_checkpoints: if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) if regex_match and regex_match.groups(): ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) checkpoints_sorted = sorted(ordering_and_checkpoint_path) checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] return checkpoints_sorted def _rotate_checkpoints(self, use_mtime=False) -> None: if self.args.save_total_limit is None or self.args.save_total_limit <= 0: return # Check if we should delete older checkpoint(s) checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime) if len(checkpoints_sorted) <= self.args.save_total_limit: return number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args.save_total_limit) checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] for checkpoint in checkpoints_to_be_deleted: curEpoch = checkpoint.split('-')[-1] print(checkpoint,curEpoch) if int(curEpoch) % 50 == 0: continue logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) shutil.rmtree(checkpoint) def evaluate( self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None, ) -> Dict[str, float]: """ Run evaluation and return metrics. The calling script will be responsible for providing a method to compute metrics, as they are task-dependent. Args: eval_dataset: (Optional) Pass a dataset if you wish to override the one on the instance. Returns: A dict containing: - the eval loss - the potential metrics computed from the predictions """ eval_dataloader = self.eval_dataLoader output = self._prediction_loop(eval_dataloader, description="Evaluation") self._log(output.metrics) if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) return output.metrics def predict(self, test_dataset: Dataset) -> PredictionOutput: """ Run prediction and return predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in evaluate(). """ test_dataloader = self.get_test_dataloader(test_dataset) return self._prediction_loop(test_dataloader, description="Prediction") def _prediction_loop( self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. Works both with or without labels. """ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only model = self.model # multi-gpu eval if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) else: model = self.model # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. batch_size = dataloader.batch_size logger.info("***** Running %s *****", description) logger.info(" Num examples = %d", self.num_examples(dataloader)) logger.info(" Batch size = %d", batch_size) eval_losses: List[float] = [] preds: torch.Tensor = None label_ids: torch.Tensor = None model.eval() if is_tpu_available(): dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) for inputs in tqdm(dataloader, desc=description): has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"]) for k, v in inputs.items(): inputs[k] = v.to(self.args.device) with torch.no_grad(): outputs = model(**inputs) if has_labels: step_eval_loss, logits = outputs[:2] eval_losses += [step_eval_loss.mean().item()] else: logits = outputs[0] if not prediction_loss_only: if preds is None: preds = logits.detach() else: preds = torch.cat((preds, logits.detach()), dim=0) if inputs.get("labels") is not None: if label_ids is None: label_ids = inputs["labels"].detach() else: label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0) if self.args.local_rank != -1: # In distributed mode, concatenate all results from all nodes: if preds is not None: preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) if label_ids is not None: label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) elif is_tpu_available(): # tpu-comment: Get all predictions and labels from all worker shards of eval dataset if preds is not None: preds = xm.mesh_reduce("eval_preds", preds, torch.cat) if label_ids is not None: label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat) # Finally, turn the aggregated tensors into numpy arrays. if preds is not None: preds = preds.cpu().numpy() if label_ids is not None: label_ids = label_ids.cpu().numpy() if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} if len(eval_losses) > 0: metrics["eval_loss"] = np.mean(eval_losses) # Prefix all keys with eval_ for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> torch.Tensor: assert self.args.local_rank != -1 output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensor) concat = torch.cat(output_tensors, dim=0) # truncate the dummy elements added by SequentialDistributedSampler output = concat[:num_total_examples] return output ================================================ FILE: code/bert-base-count5/pretrain/transformers1/trainer_tf.py ================================================ """Tensorflow trainer class.""" import logging import math import os from typing import Callable, Dict, Optional import numpy as np import tensorflow as tf from .modeling_tf_utils import TFPreTrainedModel, shape_list from .optimization_tf import GradientAccumulator, create_optimizer from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput from .training_args_tf import TFTrainingArguments logger = logging.getLogger(__name__) class TFTrainer: model: TFPreTrainedModel args: TFTrainingArguments # something similar to a PT Dataset. # This is just temporary before to have # a framework-agnostic approach for datasets. train_dataset: Optional[tf.data.Dataset] eval_dataset: Optional[tf.data.Dataset] compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool def __init__( self, model: TFPreTrainedModel, args: TFTrainingArguments, train_dataset: Optional[tf.data.Dataset] = None, eval_dataset: Optional[tf.data.Dataset] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, ): self.model = model self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only self.gradient_accumulator = GradientAccumulator() self._setup_training() def _setup_training(self) -> None: """ Setup the different steps to train a model: - check if all the data are given - create the proper strategy - create the features - prepare the model settings """ self._prepare_dataset() with self.args.strategy.scope(): self._create_optimizer() _ = self.optimizer.iterations self._set_loss_and_metric() self._create_checkpoint_manager() self._create_summary_writer() def _set_loss_and_metric(self) -> None: """ Create the training loss and metric with their name. Allowed names are those listed in the Tensorflow documentation and those contained in the transformers1 library. """ try: self.loss = tf.keras.losses.get( { "class_name": self.args.loss_name, "config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE}, } ) except TypeError: self.loss = tf.keras.losses.get( {"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}} ) def _create_summary_writer(self) -> None: """ Create a summary writer to be able to read the logs in Tensorboard. """ self.writer = tf.summary.create_file_writer(self.args.logging_dir) def _prepare_dataset(self) -> None: """ Prepare the training, validation and test data. """ if self.train_dataset is not None: self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() if self.args.max_steps > 0: self.train_steps = self.args.max_steps else: self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size) self.train_dataset = ( self.train_dataset.cache() .shuffle(self.num_train_examples) .batch(self.args.train_batch_size) .prefetch(tf.data.experimental.AUTOTUNE) ) if self.args.max_steps > 0: self.train_dataset = self.train_dataset.repeat(-1) self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset) else: self.train_steps = 0 if self.eval_dataset is not None: self.eval_dataset = ( self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE) ) self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset) def _create_optimizer(self) -> None: """ Create the training optimizer with its name. Allowed names are those listed in the Tensorflow documentation and those contained in the transformers1 library. """ if self.args.optimizer_name == "adamw": self.optimizer = create_optimizer( self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr ) else: try: self.optimizer = tf.keras.optimizers.get( { "class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon}, } ) except TypeError: # This is for the case where the optimizer is not Adam-like such as SGD self.optimizer = tf.keras.optimizers.get( {"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}} ) logger.info("Created an/a {} optimizer".format(self.args.optimizer_name)) def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None: """ Create a checkpoint manager in order to be able to make the training fault-tolerant. Args: max_to_keep: the maximum number of checkpoints to keep in the checkpoint path. load_model: if we want to start the training from the latest checkpoint. """ ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep) if load_model: ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial() @tf.function def _evaluate_steps(self, per_replica_features, per_replica_labels): """ One step evaluation across replica. Args: per_replica_features: the batched features. per_replica_labels: the batched labels. Returns: The loss corresponding to the given batch. """ per_replica_loss, per_replica_logits = self.args.strategy.experimental_run_v2( self._run_model, args=(per_replica_features, per_replica_labels, False) ) try: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) except ValueError: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) return reduced_loss, per_replica_logits def _prediction_loop( self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: logger.info("***** Running %s *****", description) logger.info(" Batch size = %d", self.args.eval_batch_size) label_ids: np.ndarray = None preds: np.ndarray = None step: int = 1 for features, labels in dataset: step = tf.convert_to_tensor(step, dtype=tf.int64) loss, logits = self._evaluate_steps(features, labels) loss = tf.reduce_mean(loss) if not prediction_loss_only: if self.args.n_gpu > 1: for val in logits.values: if preds is None: preds = val.numpy() else: preds = np.append(preds, val.numpy(), axis=0) for val in labels.values: if label_ids is None: label_ids = val.numpy() else: label_ids = np.append(label_ids, val.numpy(), axis=0) else: if preds is None: preds = logits.numpy() else: preds = np.append(preds, logits.numpy(), axis=0) if label_ids is None: label_ids = labels.numpy() else: label_ids = np.append(label_ids, labels.numpy(), axis=0) step += 1 if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} metrics["eval_loss"] = loss.numpy() for key in list(metrics.keys()): if not key.startswith("eval_"): metrics[f"eval_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) def evaluate( self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None ) -> Dict[str, float]: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. """ if eval_dataset is None: eval_dataset = self.eval_dataset output = self._prediction_loop(eval_dataset, description="Evaluation") return output.metrics def train(self) -> None: """ Train method to train the model. """ if self.args.debug: tf.summary.trace_on(graph=True, profiler=True) self.gradient_accumulator.reset() iterations = self.optimizer.iterations if iterations.numpy() > 0: logger.info("Start the training from the last checkpoint") start_epoch = (iterations.numpy() // self.train_steps) + 1 else: start_epoch = 1 tf.summary.experimental.set_step(iterations) epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_train_examples) logger.info(" Num Epochs = %d", epochs) logger.info(" Total optimization steps = %d", self.train_steps) for epoch in range(start_epoch, int(epochs + 1)): for training_loss in self._training_steps(): step = iterations.numpy() if self.args.debug: with self.writer.as_default(): tf.summary.scalar("loss", training_loss, step=step) if step == 1 and self.args.debug: with self.writer.as_default(): tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir) if self.args.evaluate_during_training and step % self.args.eval_steps == 0: logs = {} results = self.evaluate() for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value if callable(self.optimizer.learning_rate): logs["learning_rate"] = self.optimizer.learning_rate(step).numpy() else: logs["learning_rate"] = self.optimizer.learning_rate.numpy() logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs)) with self.writer.as_default(): for k, v in logs.items(): tf.summary.scalar(k, v, step=step) if step % self.args.logging_steps == 0: logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy())) if step % self.args.save_steps == 0: ckpt_save_path = self.model.ckpt_manager.save() logger.info("Saving checkpoint for step {} at {}".format(step, ckpt_save_path)) if step % self.train_steps == 0: break def _training_steps(self): """ Returns a generator over training steps (i.e. parameters update). """ for i, loss in enumerate(self._accumulate_next_gradients()): if i % self.args.gradient_accumulation_steps == 0: self._apply_gradients() yield loss @tf.function def _apply_gradients(self): """Applies the gradients (cross-replica).""" self.args.strategy.experimental_run_v2(self._step) def _step(self): """Applies gradients and resets accumulation.""" gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync gradients = [ gradient / tf.cast(gradient_scale, gradient.dtype) for gradient in self.gradient_accumulator.gradients ] gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients] self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) self.gradient_accumulator.reset() def _accumulate_next_gradients(self): """Accumulates the gradients from the next element in dataset.""" iterator = iter(self.train_dataset) @tf.function def _accumulate_next(): per_replica_features, per_replica_labels = next(iterator) return self._accumulate_gradients(per_replica_features, per_replica_labels) while True: try: yield _accumulate_next() except tf.errors.OutOfRangeError: break def _accumulate_gradients(self, per_replica_features, per_replica_labels): """Accumulates the gradients across all the replica.""" per_replica_loss = self.args.strategy.experimental_run_v2( self._forward, args=(per_replica_features, per_replica_labels) ) try: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) except ValueError: reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) return reduced_loss def _forward(self, features, labels): """Forwards a training example and accumulates the gradients.""" per_example_loss, _ = self._run_model(features, labels, True) gradients = tf.gradients(per_example_loss, self.model.trainable_variables) gradients = [ g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables) ] self.gradient_accumulator(gradients) return per_example_loss def _run_model(self, features, labels, training): """ Computes the loss of the given features and labels pair. Args: features: the batched features. labels: the batched labels. training: run the model in training mode or not """ if self.args.mode == "text-classification" or self.args.mode == "token-classification": logits = self.model(features, training=training)[0] else: logits = self.model(features, training=training) if self.args.mode == "token-classification": active_loss = tf.reshape(labels, (-1,)) != -1 reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) loss = self.loss(labels, reduced_logits) elif self.args.mode == "question-answering": start_loss = self.loss(labels["start_position"], logits[0]) end_loss = self.loss(labels["end_position"], logits[1]) loss = (start_loss + end_loss) / 2.0 else: loss = self.loss(labels, logits) loss += sum(self.model.losses) * (1.0 / self.args.n_gpu) return loss, logits def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput: """ Run prediction and return predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in evaluate(). Args: test_dataset: something similar to a PT Dataset. This is just temporary before to have a framework-agnostic approach for datasets. """ test_dataset = test_dataset.batch(self.args.eval_batch_size) test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset) return self._prediction_loop(test_dataset, description="Prediction") def save_model(self) -> None: """ Save the pretrained model and create a Tensorflow saved model. """ logger.info("Saving model in {}".format(self.args.output_dir)) path = os.path.join(self.args.output_dir, "saved_model") logger.info("Saving model in {}".format(path)) os.makedirs(path, exist_ok=True) self.model.save_pretrained(self.args.output_dir) ================================================ FILE: code/bert-base-count5/pretrain/transformers1/trainer_utils.py ================================================ from typing import Dict, NamedTuple, Optional import numpy as np class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. """ predictions: np.ndarray label_ids: np.ndarray class PredictionOutput(NamedTuple): predictions: np.ndarray label_ids: Optional[np.ndarray] metrics: Optional[Dict[str, float]] class TrainOutput(NamedTuple): global_step: int training_loss: float PREFIX_CHECKPOINT_DIR = "checkpoint" ================================================ FILE: code/bert-base-count5/pretrain/transformers1/training_args.py ================================================ import dataclasses import json import logging from dataclasses import dataclass, field from typing import Any, Dict, Optional, Tuple from .file_utils import cached_property, is_torch_available, torch_required if is_torch_available(): import torch try: import torch_xla.core.xla_model as xm _has_tpu = True except ImportError: _has_tpu = False @torch_required def is_tpu_available(): return _has_tpu logger = logging.getLogger(__name__) @dataclass class TrainingArguments: """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line. """ output_dir: str = field( metadata={"help": "The output directory where the model predictions and checkpoints will be written."} ) overwrite_output_dir: bool = field( default=False, metadata={ "help": ( "Overwrite the content of the output directory." "Use this to continue training if output_dir points to a checkpoint directory." ) }, ) do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) evaluate_during_training: bool = field( default=False, metadata={"help": "Run evaluation during training at each logging step."}, ) per_device_train_batch_size: int = field( default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."} ) per_device_eval_batch_size: int = field( default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."} ) per_gpu_train_batch_size: Optional[int] = field( default=None, metadata={ "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. " "Batch size per GPU/TPU core/CPU for training." }, ) per_gpu_eval_batch_size: Optional[int] = field( default=None, metadata={ "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred." "Batch size per GPU/TPU core/CPU for evaluation." }, ) gradient_accumulation_steps: int = field( default=1, metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, ) learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."}) lr_end: float = field(default=1e-5, metadata={"help": "学习率最后衰减到多少."}) weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."}) adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."}) max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) max_steps: int = field( default=-1, metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, ) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"}) logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_total_limit: Optional[int] = field( default=None, metadata={ "help": ( "Limit the total amount of checkpoints." "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" ) }, ) no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) seed: int = field(default=42, metadata={"help": "random seed for initialization"}) fp16: bool = field( default=False, metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"}, ) fp16_opt_level: str = field( default="O1", metadata={ "help": ( "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html" ) }, ) local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) tpu_num_cores: Optional[int] = field( default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"} ) tpu_metrics_debug: bool = field(default=False, metadata={"help": "TPU: Whether to print debug metrics"}) @property def train_batch_size(self) -> int: if self.per_gpu_train_batch_size: logger.warning( "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " "version. Using `--per_device_train_batch_size` is preferred." ) per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size return per_device_batch_size * max(1, self.n_gpu) @property def eval_batch_size(self) -> int: if self.per_gpu_eval_batch_size: logger.warning( "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " "version. Using `--per_device_eval_batch_size` is preferred." ) per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size return per_device_batch_size * max(1, self.n_gpu) @cached_property @torch_required def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_tpu_available(): device = xm.xla_device() n_gpu = 0 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) n_gpu = 1 return device, n_gpu @property @torch_required def device(self) -> "torch.device": return self._setup_devices[0] @property @torch_required def n_gpu(self): return self._setup_devices[1] def to_json_string(self): """ Serializes this instance to a JSON string. """ return json.dumps(dataclasses.asdict(self), indent=2) def to_sanitized_dict(self) -> Dict[str, Any]: """ Sanitized serialization to use with TensorBoard’s hparams """ d = dataclasses.asdict(self) valid_types = [bool, int, float, str] if is_torch_available(): valid_types.append(torch.Tensor) return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} ================================================ FILE: code/bert-base-count5/pretrain/transformers1/training_args_tf.py ================================================ import logging from dataclasses import dataclass, field from typing import Tuple from .file_utils import cached_property, is_tf_available, tf_required from .training_args import TrainingArguments logger = logging.getLogger(__name__) if is_tf_available(): import tensorflow as tf @dataclass class TFTrainingArguments(TrainingArguments): optimizer_name: str = field( default="adam", metadata={ "help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"' }, ) mode: str = field( default="text-classification", metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'}, ) loss_name: str = field( default="SparseCategoricalCrossentropy", metadata={ "help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses" }, ) tpu_name: str = field( default=None, metadata={"help": "Name of TPU"}, ) end_lr: float = field( default=0, metadata={"help": "End learning rate for optimizer"}, ) eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) debug: bool = field( default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} ) @cached_property @tf_required def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]: logger.info("Tensorflow: setting up strategy") gpus = tf.config.list_physical_devices("GPU") if self.no_cuda: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: try: if self.tpu_name: tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) else: tpu = tf.distribute.cluster_resolver.TPUClusterResolver() except ValueError: tpu = None if tpu: tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initialize_tpu_system(tpu) strategy = tf.distribute.experimental.TPUStrategy(tpu) elif len(gpus) == 0: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") elif len(gpus) == 1: strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") elif len(gpus) > 1: # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` strategy = tf.distribute.MirroredStrategy() else: raise ValueError("Cannot find the proper strategy please check your environment properties.") return strategy @property @tf_required def strategy(self) -> "tf.distribute.Strategy": return self._setup_strategy @property @tf_required def n_gpu(self) -> int: return self._setup_strategy.num_replicas_in_sync ================================================ FILE: code/bert-base-count5/pretrain/transformers1/try.py ================================================ from transformers import TFAlbertForMaskedLM, TFAlbertModel, TFAlbertForSequenceClassification, AlbertForMaskedLM import os checkpoint = "albert-base-v1" model = AlbertForMaskedLM.from_pretrained(checkpoint) if not os.path.exists("~/saved/" + checkpoint): os.makedirs("~/saved/" + checkpoint) model.save_pretrained("~/saved/" + checkpoint) model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint, from_pt=True) model.save_pretrained("~/saved/" + checkpoint) model = TFAlbertModel.from_pretrained('~/saved/' + checkpoint) model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint) model = TFAlbertForSequenceClassification.from_pretrained('~/saved/' + checkpoint) print("nice model") ================================================ FILE: code/bert-base-count5/pretrain/transformers1/utils_encoder_decoder.py ================================================ # coding=utf-8 # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classes to support Encoder-Decoder architectures """ def prepare_encoder_decoder_model_kwargs(**kwargs): """ Prepare the encoder and decoder's keyword arguments. Keyword arguments come in 3 flavors: - encoder-specific (prefixed by `encoder_`) - decoder-specific (prefixed by `decoder_`) - those that apply to the model as whole. We let the specific kwargs override the common ones in case of conflict. """ kwargs_common = { argument: value for argument, value in kwargs.items() if not argument.startswith("encoder_") and not argument.startswith("decoder_") } if "input_ids" in kwargs_common: kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") decoder_kwargs = kwargs_common.copy() encoder_kwargs = kwargs_common.copy() encoder_kwargs.update( {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} ) decoder_kwargs.update( {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} ) decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) return encoder_kwargs, decoder_kwargs ================================================ FILE: code/bert-base-count5-len32/finetuning/.ipynb_checkpoints/PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved. \n", "Licensed under the MIT License." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference PyTorch Bert Model with ONNX Runtime on GPU" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, you'll learn how to load a Bert model from PyTorch, convert it to ONNX, and inference it for high performance using ONNX Runtime and NVIDIA GPU. In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text from the corresponding reading passage, or the question might be unanswerable.\n", "\n", "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. Prerequisites ##\n", "It requires your machine to have a GPU, and a python environment with [PyTorch](https://pytorch.org/) installed before running this notebook.\n", "\n", "#### GPU Environment Setup using AnaConda\n", "\n", "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", "\n", "```console\n", "conda create -n gpu_env python=3.7\n", "conda activate gpu_env\n", "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n", "conda install -c anaconda ipykernel\n", "conda install -c conda-forge ipywidgets\n", "python -m ipykernel install --user --name=gpu_env_py37\n", "jupyter notebook\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n", "\n", "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Skipping onnxruntime-gpu as it is not installed.\u001b[0m\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n", "!{sys.executable} -m pip install --quiet --upgrade transformers\n", "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n", "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", "!{sys.executable} -m pip install --quiet wget netron pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Load Pretrained Bert model ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We begin by downloading the SQuAD data file and store them in the specified location. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "cache_dir = \"./squad\"\n", "if not os.path.exists(cache_dir):\n", " os.makedirs(cache_dir)\n", "\n", "predict_file_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json\"\n", "predict_file = os.path.join(cache_dir, \"dev-v1.1.json\")\n", "if not os.path.exists(predict_file):\n", " import wget\n", " print(\"Start downloading predict file.\")\n", " wget.download(predict_file_url, predict_file)\n", " print(\"Predict file downloaded.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first define some constant variables." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Whether allow overwriting existing ONNX model and download the latest script from GitHub\n", "enable_overwrite = True\n", "\n", "# Total samples to inference, so that we can get average latency\n", "total_samples = 1000\n", "\n", "# ONNX opset version\n", "opset_version=11" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Specify some model configuration variables." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n", "model_name_or_path = \"bert-base-cased\"\n", "max_seq_length = 128\n", "doc_stride = 128\n", "max_query_length = 64" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Start to load model from pretrained. This step could take a few minutes. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n", "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n", "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n" ] } ], "source": [ "# The following code is adapted from HuggingFace transformers\n", "# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n", "\n", "from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "\n", "# Load pretrained model and tokenizer\n", "config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)\n", "config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n", "tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", "model = model_class.from_pretrained(model_name_or_path,\n", " from_tf=False,\n", " config=config,\n", " cache_dir=cache_dir)\n", "# load some examples\n", "from transformers.data.processors.squad import SquadV1Processor\n", "\n", "processor = SquadV1Processor()\n", "examples = processor.get_dev_examples(None, filename=predict_file)\n", "\n", "from transformers import squad_convert_examples_to_features\n", "features, dataset = squad_convert_examples_to_features( \n", " examples=examples[:total_samples], # convert enough examples for this notebook\n", " tokenizer=tokenizer,\n", " max_seq_length=max_seq_length,\n", " doc_stride=doc_stride,\n", " max_query_length=max_query_length,\n", " is_training=False,\n", " return_dataset='pt'\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Export the loaded model ##\n", "Once the model is loaded, we can export the loaded PyTorch model to ONNX." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model exported at ./onnx/bert-base-cased-squad_opset11.onnx\n" ] } ], "source": [ "output_dir = \"./onnx\"\n", "if not os.path.exists(output_dir):\n", " os.makedirs(output_dir) \n", "export_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opset{}.onnx'.format(opset_version))\n", "\n", "import torch\n", "use_gpu = torch.cuda.is_available()\n", "device = torch.device(\"cuda\" if use_gpu else \"cpu\")\n", "\n", "# Get the first example data to run the model and export it to ONNX\n", "data = dataset[0]\n", "inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", "}\n", "\n", "# Set model to inference mode, which is required before exporting the model because some operators behave differently in \n", "# inference and training mode.\n", "model.eval()\n", "model.to(device)\n", "\n", "if enable_overwrite or not os.path.exists(export_model_path):\n", " with torch.no_grad():\n", " symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}\n", " torch.onnx.export(model, # model being run\n", " args=tuple(inputs.values()), # model input (or a tuple for multiple inputs)\n", " f=export_model_path, # where to save the model (can be a file or file-like object)\n", " opset_version=opset_version, # the ONNX version to export the model to\n", " do_constant_folding=True, # whether to execute constant folding for optimization\n", " input_names=['input_ids', # the model's input names\n", " 'input_mask', \n", " 'segment_ids'],\n", " output_names=['start', 'end'], # the model's output names\n", " dynamic_axes={'input_ids': symbolic_names, # variable length axes\n", " 'input_mask' : symbolic_names,\n", " 'segment_ids' : symbolic_names,\n", " 'start' : symbolic_names,\n", " 'end' : symbolic_names})\n", " print(\"Model exported at \", export_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. PyTorch Inference ##\n", "Use PyTorch to evaluate an example input for comparison purpose." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PyTorch cuda Inference time = 16.57 ms\n" ] } ], "source": [ "import time\n", "\n", "# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.\n", "latency = []\n", "with torch.no_grad():\n", " for i in range(total_samples):\n", " data = dataset[i]\n", " inputs = {\n", " 'input_ids': data[0].to(device).reshape(1, max_seq_length),\n", " 'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n", " 'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n", " }\n", " start = time.time()\n", " outputs = model(**inputs)\n", " latency.append(time.time() - start)\n", "print(\"PyTorch {} Inference time = {} ms\".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", "### CUDA and cuDNN Path\n", "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n", "\n", "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", "\n", "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", "add_cuda_path = False\n", "\n", "if add_cuda_path:\n", " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n", " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n", " else:\n", " if cuda_dir == cudnn_dir:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", " else:\n", " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OpenMP Environment Variable\n", "\n", "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n", "\n", "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", "\n", "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Optional. You can change them according to Performance Test Tool result.\n", "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n", "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are ready to inference the model with ONNX Runtime." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OnnxRuntime gpu Inference time = 4.43 ms\n" ] } ], "source": [ "import psutil\n", "import onnxruntime\n", "import numpy\n", "\n", "assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()\n", "device_name = 'gpu'\n", "\n", "sess_options = onnxruntime.SessionOptions()\n", "\n", "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n", "# Note that this will increase session creation time so enable it for debugging only.\n", "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_{}.onnx\".format(device_name))\n", "\n", "# Please change the value according to best setting in Performance Test Tool result.\n", "sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n", "\n", "session = onnxruntime.InferenceSession(export_model_path, sess_options)\n", "\n", "latency = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n", " ort_inputs = {\n", " 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n", " 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n", " 'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n", " }\n", " start = time.time()\n", " ort_outputs = session.run(None, ort_inputs)\n", " latency.append(time.time() - start)\n", " \n", "print(\"OnnxRuntime {} Inference time = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can compare the output of PyTorch and ONNX Runtime. We can see some results are not close. It is because ONNX Runtime uses some approximation in CUDA optimization. Based on our evaluation on SQuAD data set, F1 score is on par for models before and after optimization." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Verifying correctness *****\n", "PyTorch and ONNX Runtime output 0 are close: True\n", "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n", "PyTorch and ONNX Runtime output 1 are close: True\n", "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n" ] } ], "source": [ "print(\"***** Verifying correctness *****\")\n", "for i in range(2): \n", " print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-02, atol=1e-02))\n", " diff = ort_outputs[i] - outputs[i].cpu().numpy()\n", " max_diff = numpy.max(numpy.abs(diff))\n", " avg_diff = numpy.average(numpy.abs(diff))\n", " print(f'maximum_diff={max_diff} average_diff={avg_diff}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inference with Actual Sequence Length\n", "Note that ONNX model is exported using dynamic length axis. It is recommended to use actual sequence input without padding instead of fixed length input for best performance. Let's see how it can be applied to this model.\n", "\n", "From an example input below, we can see zero padding at the end of each sequence." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n", " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n", " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n", " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n", " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n", " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n", " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]],\n", " device='cuda:0'),\n", " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n", " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# An example input (we can see padding). From attention_mask, we can deduce the actual length.\n", "inputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original sequence length is 128. After removing paddings, the sequence length is reduced. Input with smaller sequence length need less computation, thus we can see there is improvement on inference latency. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average length 101\n", "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n" ] } ], "source": [ "import statistics\n", "\n", "latency = []\n", "lengths = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", " # Instead of using fixed length (128), we can use actual sequence length (less than 128), which helps to get better performance.\n", " actual_sequence_length = sum(data[1].numpy())\n", " lengths.append(actual_sequence_length)\n", " opt_inputs = {\n", " 'input_ids': data[0].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'input_mask': data[1].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length),\n", " 'segment_ids': data[2].numpy()[:actual_sequence_length].reshape(1, actual_sequence_length)\n", " }\n", " start = time.time()\n", " opt_outputs = session.run(None, opt_inputs)\n", " latency.append(time.time() - start)\n", "print(\"Average length\", statistics.mean(lengths))\n", "print(\"OnnxRuntime {} Inference time with actual sequence length = {} ms\".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's compare the output and see whether the results are close.\n", "\n", "**Note**: Need end-to-end evaluation on performance and accuracy if you use this strategy." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "***** Comparing results with/without paddings *****\n", "Output 0 are close: True\n", "Output 1 are close: True\n" ] } ], "source": [ "print(\"***** Comparing results with/without paddings *****\")\n", "for i in range(2):\n", " print('Output {} are close:'.format(i), numpy.allclose(opt_outputs[i], ort_outputs[i][:,:len(opt_outputs[i][0])], rtol=1e-03, atol=1e-03))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Offline Optimization and Test Tools\n", "\n", "It is recommended to try [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It could help verify whether the model can be fully optimized, and get performance test results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Transformer Optimizer\n", "\n", "Although OnnxRuntime could optimize Bert model exported by PyTorch. Sometime, model cannot be fully optimized due to different reasons:\n", "* A new subgraph pattern is generated by new version of export tool, and the pattern is not covered by older version of OnnxRuntime. \n", "* The exported model uses dynamic axis and this makes it harder for shape inference of the graph. That blocks some optimization to be applied.\n", "* Some optimization is better to be done offline. Like change input tensor type from int64 to int32 to avoid extra Cast nodes, or convert model to float16 to achieve better performance in V100 or T4 GPU.\n", "\n", "We have python script **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n", "\n", "In this example, we can see that it introduces optimization that is not provided by onnxruntime: SkipLayerNormalization and bias fusion, which is not fused in OnnxRuntime due to shape inference as mentioned.\n", "\n", "It will also tell whether the model is fully optimized or not. If not, that means you might need change the script to fuse some new pattern of subgraph.\n", "\n", "Example Usage:\n", "```\n", "from onnxruntime_tools import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", "You can also use optimizer_cli like the following:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Float32 Model\n", "Let us optimize the ONNX model using the script. The first example will output model with float32 to store weights. This is the choice for most GPUs without Tensor Core.\n", "\n", "If your GPU (like V100 or T4) has Tensor Core, jump to [Float16 Model](#6.-Model-Optimization-with-Float16) section since that will give you better performance than Float32 model." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n", "\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Optimized Graph\n", "We can open the optimized model using [Netron](https://github.com/lutzroeder/netron) to visualize.\n", "\n", "The graph is like the following:\n", "\n", "\n", "Sometime, optimized graph is slightly different. For example, FastGelu is replaced by BiasGelu for CPU inference; When the option --input_int32 is used, Cast nodes for inputs are removed." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import netron\n", "\n", "# change it to True if want to view the optimized model in browser\n", "enable_netron = False\n", "if enable_netron:\n", " # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n", " netron.start(optimized_fp32_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance Test Tool\n", "\n", "The following will create 1000 random inputs of batch_size 1 and sequence length 128, then measure the average latency and throughput numbers.\n", "\n", "Note that the test uses fixed sequence length. If you use [dynamic sequence length](#Inference-with-Actual-Sequence-Length), actual performance depends on the distribution of sequence length.\n", "\n", "**Attention**: Latency numbers from Jupyter Notebook are not accurate. See [Attional Info](#7.-Additional-Info) for more info." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.92 ms, Throughput = 203.24 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.90 ms, Throughput = 203.88 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 5.07 ms, Throughput = 197.16 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.82 ms, Throughput = 207.33 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.93 ms, Throughput = 202.92 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.91 ms, Throughput = 203.55 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.88 ms, Throughput = 204.90 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's load the summary file and take a look. Note that blank value in OMP_NUM_THREADS or OMP_WAIT_POLICY means the environment variable does not exist." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
04.824.534.575.157.258.75207.33112ACTIVENoneTrue
14.884.544.586.477.138.68204.901212PASSIVENoneTrue
24.904.544.576.167.648.82203.88112PASSIVENoneTrue
34.914.554.596.707.438.78203.551212ACTIVENoneTrue
44.924.574.606.507.828.90203.240NoneTrue
54.934.554.596.667.578.80202.92121PASSIVENoneTrue
65.074.564.617.198.119.01197.16121ACTIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 4.82 4.53 4.57 5.15 7.25 \n", "1 4.88 4.54 4.58 6.47 7.13 \n", "2 4.90 4.54 4.57 6.16 7.64 \n", "3 4.91 4.55 4.59 6.70 7.43 \n", "4 4.92 4.57 4.60 6.50 7.82 \n", "5 4.93 4.55 4.59 6.66 7.57 \n", "6 5.07 4.56 4.61 7.19 8.11 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 8.75 207.33 1 12 \n", "1 8.68 204.90 12 12 \n", "2 8.82 203.88 1 12 \n", "3 8.78 203.55 12 12 \n", "4 8.90 203.24 0 \n", "5 8.80 202.92 12 1 \n", "6 9.01 197.16 12 1 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 PASSIVE None True \n", "2 PASSIVE None True \n", "3 ACTIVE None True \n", "4 None True \n", "5 PASSIVE None True \n", "6 ACTIVE None True " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", "\n", "### Model Results Comparison Tool\n", "\n", "When a BERT model is optimized, some approximation is used in calculation. If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare the inference outputs of the original and optimized models. If outputs are all close, it is safe to use the optimized model.\n", "\n", "For GPU inference, the absolute or relative difference is larger than those numbers of CPU inference. Note that slight difference in output will not impact final result. We did end-to-end evaluation using SQuAD data set using a fine-tuned squad model, and F1 score is almost the same before/after optimization." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", "maximum absolute difference=1.9222497940063477e-06\r\n", "maximum relative difference=0.05027933046221733\r\n" ] } ], "source": [ "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Model Optimization with Float16\n", "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", " apply: Fused SkipLayerNormalization count: 25\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.90 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.12 ms, Throughput = 320.00 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.02 ms, Throughput = 331.39 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 332.53 QPS\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 328.67 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.01 ms, Throughput = 331.72 QPS\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.04 ms, Throughput = 329.32 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguouswarmup
03.012.792.812.865.087.16332.53112ACTIVENoneTrue
13.012.802.812.884.527.05331.900NoneTrue
23.012.782.802.925.017.02331.721212ACTIVENoneTrue
33.022.792.802.856.347.04331.39121ACTIVENoneTrue
43.042.802.822.935.567.08329.321212PASSIVENoneTrue
53.042.792.812.926.377.08328.67121PASSIVENoneTrue
63.122.792.822.966.667.20320.00112PASSIVENoneTrue
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.01 2.79 2.81 2.86 5.08 \n", "1 3.01 2.80 2.81 2.88 4.52 \n", "2 3.01 2.78 2.80 2.92 5.01 \n", "3 3.02 2.79 2.80 2.85 6.34 \n", "4 3.04 2.80 2.82 2.93 5.56 \n", "5 3.04 2.79 2.81 2.92 6.37 \n", "6 3.12 2.79 2.82 2.96 6.66 \n", "\n", " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", "0 7.16 332.53 1 12 \n", "1 7.05 331.90 0 \n", "2 7.02 331.72 12 12 \n", "3 7.04 331.39 12 1 \n", "4 7.08 329.32 12 12 \n", "5 7.08 328.67 12 1 \n", "6 7.20 320.00 1 12 \n", "\n", " OMP_WAIT_POLICY contiguous warmup \n", "0 ACTIVE None True \n", "1 None True \n", "2 ACTIVE None True \n", "3 ACTIVE None True \n", "4 PASSIVE None True \n", "5 PASSIVE None True \n", "6 PASSIVE None True " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Throughput Tuning\n", "\n", "Some application need best throughput under some constraint on latency. This can be done by testing performance of different batch sizes. The tool could help on this.\n", "\n", "Here is an example that check the performance of multiple batch sizes (1, 2, 4, 8, 16, 32 and 64) using default settings." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n", "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.00 ms, Throughput = 333.83 QPS\n", "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 3.59 ms, Throughput = 557.32 QPS\n", "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=64 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n", "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=4 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 4.32 ms, Throughput = 926.92 QPS\n", "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n", "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n", "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)batch_size
03.002.792.812.864.377.08333.831
13.593.333.353.426.607.54557.322
24.323.984.014.647.238.11926.924
36.325.945.977.618.9610.121266.638
49.609.229.2511.3212.3313.341666.0516
516.1715.8015.9017.3818.8019.931979.4132
629.2628.8929.0130.6332.5333.282187.1564
\n", "
" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", "0 3.00 2.79 2.81 2.86 4.37 \n", "1 3.59 3.33 3.35 3.42 6.60 \n", "2 4.32 3.98 4.01 4.64 7.23 \n", "3 6.32 5.94 5.97 7.61 8.96 \n", "4 9.60 9.22 9.25 11.32 12.33 \n", "5 16.17 15.80 15.90 17.38 18.80 \n", "6 29.26 28.89 29.01 30.63 32.53 \n", "\n", " Latency_P99 Throughput(QPS) batch_size \n", "0 7.08 333.83 1 \n", "1 7.54 557.32 2 \n", "2 8.11 926.92 4 \n", "3 10.12 1266.63 8 \n", "4 13.34 1666.05 16 \n", "5 19.93 1979.41 32 \n", "6 33.28 2187.15 64 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", "print(\"Float16 model summary from\", latest_result_file)\n", "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n", "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Additional Info\n", "\n", "Note that running Jupyter Notebook has significant impact on performance result. You can close Jupyter Notebook and other applications, then run the performance test in a console to get more accurate performance numbers.\n", "\n", "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it measure inference speed of OnnxRuntime.\n", "\n", "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n", "\n", "Here is the machine configuration that generated the above results. You might get slower or faster result according to your hardware." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\r\n", " \"gpu\": {\r\n", " \"driver_version\": \"440.64.00\",\r\n", " \"devices\": [\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 14110883840,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " },\r\n", " {\r\n", " \"memory_total\": 16945512448,\r\n", " \"memory_available\": 16932601856,\r\n", " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", " }\r\n", " ]\r\n", " },\r\n", " \"cpu\": {\r\n", " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n", " \"cores\": 12,\r\n", " \"logical_cores\": 12,\r\n", " \"hz\": \"2.5940 GHz\",\r\n", " \"l2_cache\": \"256 KB\",\r\n", " \"l3_cache\": \"35840 KB\",\r\n", " \"processor\": \"x86_64\"\r\n", " },\r\n", " \"memory\": {\r\n", " \"total\": 236645588992,\r\n", " \"available\": 222567559168\r\n", " },\r\n", " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n", " \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n", " \"onnxruntime\": {\r\n", " \"version\": \"1.3.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"pytorch\": {\r\n", " \"version\": \"1.5.0\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", " \"tensorflow\": null\r\n", "}\r\n" ] } ], "source": [ "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" ] } ], "metadata": { "kernelspec": { "display_name": "PyCharm (ccks_ner-master)", "language": "python", "name": "pycharm-de4c0941" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: code/bert-base-count5-len32/finetuning/Config.py ================================================ from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ RobertaTokenizer, RobertaModel, RobertaConfig from NEZHA.modeling_nezha import NeZhaModel from NEZHA.configuration_nezha import NeZhaConfig MODELS = { 'BertForClass': BertModel, 'BertForClass_MultiDropout': BertModel, 'BertLastTwoCls': BertModel, 'BertLastCls':BertModel, 'BertLastTwoClsPooler': BertModel, 'BertLastTwoEmbeddings': BertModel, 'BertLastTwoEmbeddingsPooler': BertModel, 'BertLastFourCls': BertModel, 'BertLastFourClsPooler': BertModel, 'BertLastFourEmbeddings': BertModel, 'BertLastFourEmbeddingsPooler': BertModel, 'BertDynCls': BertModel, 'BertDynEmbeddings': BertModel, 'BertRNN': BertModel, 'BertCNN': XLNetModel, 'BertRCNN': BertModel, 'XLNet': XLNetModel, 'Electra': ElectraModel, 'NEZHA': NeZhaModel } TOKENIZERS = { 'BertForClass': BertTokenizer, 'BertForClass_MultiDropout': BertTokenizer, 'BertLastTwoCls': BertTokenizer, 'BertLastCls': BertTokenizer, 'BertLastTwoClsPooler': BertTokenizer, 'BertLastTwoEmbeddings': BertTokenizer, 'BertLastTwoEmbeddingsPooler': BertTokenizer, 'BertLastFourCls': BertTokenizer, 'BertLastFourClsPooler': BertTokenizer, 'BertLastFourEmbeddings': BertTokenizer, 'BertLastFourEmbeddingsPooler': BertTokenizer, 'BertDynCls': BertTokenizer, 'BertDynEmbeddings': BertTokenizer, 'BertRNN': BertTokenizer, 'BertCNN': BertTokenizer, 'BertRCNN': BertTokenizer, 'XLNet': XLNetTokenizer, 'Electra': ElectraTokenizer, 'NEZHA': BertTokenizer } CONFIGS = { 'BertForClass': BertConfig, 'BertForClass_MultiDropout': BertConfig, 'BertLastTwoCls': BertConfig, 'BertLastCls': BertConfig, 'BertLastTwoClsPooler': BertConfig, 'BertLastTwoEmbeddings': BertConfig, 'BertLastTwoEmbeddingsPooler': BertConfig, 'BertLastFourCls': BertConfig, 'BertLastFourClsPooler': BertConfig, 'BertLastFourEmbeddings': BertConfig, 'BertLastFourEmbeddingsPooler': BertConfig, 'BertDynCls': BertConfig, 'BertDynEmbeddings': BertConfig, 'BertRNN': BertConfig, 'BertCNN': BertConfig, 'BertRCNN': BertConfig, 'XLNet': XLNetConfig, 'Electra': ElectraConfig, 'NEZHA': NeZhaConfig } ================================================ FILE: code/bert-base-count5-len32/finetuning/NEZHA/configuration_nezha.py ================================================ from transformers import PretrainedConfig NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class NeZhaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. It is used to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the ALBERT `xxlarge `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, optional, defaults to 30000): Vocabulary size of the ALBERT model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. embedding_size (:obj:`int`, optional, defaults to 128): Dimensionality of vocabulary embeddings. hidden_size (:obj:`int`, optional, defaults to 4096): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, optional, defaults to 12): Number of hidden layers in the Transformer encoder. num_hidden_groups (:obj:`int`, optional, defaults to 1): Number of groups for the hidden layers, parameters in the same group are shared. num_attention_heads (:obj:`int`, optional, defaults to 64): Number of attention heads for each attention layer in the Transformer encoder. intermediate_size (:obj:`int`, optional, defaults to 16384): The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. inner_group_num (:obj:`int`, optional, defaults to 1): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, optional, defaults to 2): The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. initializer_range (:obj:`float`, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): The epsilon used by the layer normalization layers. classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): The dropout ratio for attached classifiers. Example:: from transformers import AlbertConfig, AlbertModel # Initializing an ALBERT-xxlarge style configuration albert_xxlarge_configuration = AlbertConfig() # Initializing an ALBERT-base style configuration albert_base_configuration = AlbertConfig( hidden_size=768, num_attention_heads=12, intermediate_size=3072, ) # Initializing a model from the ALBERT-base style configuration model = AlbertModel(albert_xxlarge_configuration) # Accessing the model configuration configuration = model.config Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained checkpoints. """ pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "nezha" def __init__( self, vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, num_hidden_groups=1, num_attention_heads=64, intermediate_size=16384, inner_group_num=1, hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, max_relative_position=64, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, use_relative_position=True, pad_token_id=0, bos_token_id=2, eos_token_id=3, **kwargs ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_hidden_groups = num_hidden_groups self.num_attention_heads = num_attention_heads self.inner_group_num = inner_group_num self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.max_relative_position = max_relative_position self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_relative_position=use_relative_position self.classifier_dropout_prob = classifier_dropout_prob ================================================ FILE: code/bert-base-count5-len32/finetuning/NEZHA/modeling_nezha.py ================================================ import math import os import logging import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_nezha import NeZhaConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_utils import PreTrainedModel, prune_linear_layer from transformers.models.bert.modeling_bert import ( BertOutput, BertPooler, BertSelfOutput, BertIntermediate, BertOnlyMLMHead, BertOnlyNSPHead, BertPreTrainingHeads, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING, ) logger = logging.getLogger(__name__) _CONFIG_FOR_DOC = "NeZhaConfig" _TOKENIZER_FOR_DOC = "NeZhaTokenizer" NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: import re import numpy as np import tensorflow as tf except ImportError: logger.error( "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: # logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) for name, array in zip(names, arrays): name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step", "good_steps", "loss_scale", 'bad_steps'] for n in name ): logger.info("Skipping {}".format("/".join(name))) continue pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): scope_names = re.split(r"_(\d+)", m_name) else: scope_names = [m_name] if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue if len(scope_names) >= 2: num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") elif m_name == "kernel": array = np.transpose(array) try: assert ( pointer.shape == array.shape ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" except AssertionError as e: e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) return model class NeZhaEmbeddings(nn.Module): """ Construct the embeddings from word, position and token_type embeddings. """ def __init__(self, config): super().__init__() self.use_relative_position = config.use_relative_position self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] device = input_ids.device if input_ids is not None else inputs_embeds.device if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings def relative_position_encoding(depth, max_length=512, max_relative_position=127): vocab_size = max_relative_position * 2 + 1 range_vec = torch.arange(max_length) range_mat = range_vec.repeat(max_length).view(max_length, max_length) distance_mat = range_mat - torch.t(range_mat) distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) final_mat = distance_mat_clipped + max_relative_position embeddings_table = torch.zeros(vocab_size, depth) position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) embeddings_table[:, 0::2] = torch.sin(position * div_term) embeddings_table[:, 1::2] = torch.cos(position * div_term) embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) flat_relative_positions_matrix = final_mat.view(-1) one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, num_classes=vocab_size).float() positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) return positions_encoding class NeZhaSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, depth=self.attention_head_size, max_relative_position=config.max_relative_position).to('cuda') def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states) attention_mask = encoder_attention_mask else: mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) query_layer = self.transpose_for_scores(mixed_query_layer) key_layer = self.transpose_for_scores(mixed_key_layer) value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] query_layer_t = query_layer.permute(2, 0, 1, 3) query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, self.attention_head_size) key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, num_attention_heads, from_seq_length) key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :] attention_probs_t = attention_probs.permute(2, 0, 1, 3) attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, to_seq_length) value_position_scores = torch.matmul(attentions_probs_r, relations_values) value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, num_attention_heads, self.attention_head_size) value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) context_layer = context_layer + value_position_scores_r_t context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class NeZhaAttention(nn.Module): def __init__(self, config): super().__init__() self.self = NeZhaSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class NeZhaLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = NeZhaAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: self.crossattention = NeZhaAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) outputs = (layer_output,) + outputs return outputs class NeZhaEncoder(nn.Module): def __init__(self, config): super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask ) hidden_states = layer_outputs[0] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # Add last layer if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) return outputs # last-layer hidden state, (all hidden states), (all attentions) class NeZhaPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = NeZhaConfig pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "bert" def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() @add_start_docstrings( "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) class NeZhaModel(NeZhaPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration set to :obj:`True`; an :obj:`encoder_hidden_states` is expected as an input to the forward pass. .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 """ def __init__(self, config): super().__init__(config) self.config = config self.embeddings = NeZhaEmbeddings(config) self.encoder = NeZhaEncoder(config) self.pooler = BertPooler(config) self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, ): r""" Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pre-training. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertModel, BertTokenizer import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( attention_mask, input_shape, self.device ) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) outputs = (sequence_output, pooled_output,) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, ) class NeZhaForPreTraining(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertPreTrainingHeads(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, ): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForPreTraining import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) # add hidden states and attention if they are here outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class NeZhaForMaskedLM(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMaskedLM import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. masked_lm_labels = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape effective_batch_size = input_shape[0] # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: attention_mask = input_ids.new_ones(input_shape) # if model is does not use a causal mask then add a dummy token if self.config.is_decoder is False: assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" attention_mask = torch.cat( [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 ) dummy_token = torch.full( (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device ) input_ids = torch.cat([input_ids, dummy_token], dim=1) return {"input_ids": input_ids, "attention_mask": attention_mask} @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, next_sentence_label=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForNextSentencePrediction import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) class NeZhaForSequenceClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForSequenceClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) class NeZhaForMultipleChoice(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForMultipleChoice import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) class NeZhaForTokenClassification(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, position_ids=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForTokenClassification import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = NeZhaModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, head_mask=None, inputs_embeds=None, position_ids=None, start_positions=None, end_positions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import BertTokenizer, BertForQuestionAnswering import torch tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" encoding = tokenizer.encode_plus(question, text) input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) assert answer == "a nice puppet" """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) ================================================ FILE: code/bert-base-count5-len32/finetuning/model.py ================================================ import torch import random import os from torch import nn, optim import torch.nn.functional as F from transformers.activations import get_activation from Config import * class BertForClass(nn.Module): def __init__(self, config): super(BertForClass, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit class BertForClass_MultiDropout(nn.Module): def __init__(self, config): super(BertForClass_MultiDropout, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.multi_drop = 5 self.multi_dropouts = nn.ModuleList([nn.Dropout(config.dropout) for _ in range(self.multi_drop)]) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) for j, dropout in enumerate(self.multi_dropouts): if j == 0: logit = self.classifier(dropout(concat_out)) / self.multi_drop else: logit += self.classifier(dropout(concat_out)) / self.multi_drop return logit class BertLastTwoCls(nn.Module): def __init__(self, config): super(BertLastTwoCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) logit = self.classifier(pooler_output) return logit class BertLastCls(nn.Module): def __init__(self, config): super(BertLastCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] if self.isDropout: output = self.dropout(pooler_output) logit = self.classifier(output) return logit class BertLastTwoClsPooler(nn.Module): def __init__(self, config): super(BertLastTwoClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddings(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastTwoEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastTwoEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 3, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourCls(nn.Module): def __init__(self, config): super(BertLastFourCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = output[0] pooler_output = output[1] hidden_states = output[2] output = torch.cat( (hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourClsPooler(nn.Module): def __init__(self, config): super(BertLastFourClsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) output = torch.cat( (pooler_output, hidden_states[-1][:, 0], hidden_states[-2][:, 0], hidden_states[-3][:, 0], hidden_states[-4][:, 0]), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddings(nn.Module): def __init__(self, config): super(BertLastFourEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 4, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertLastFourEmbeddingsPooler(nn.Module): def __init__(self, config): super(BertLastFourEmbeddingsPooler, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 5, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) hidden_states1 = torch.mean(hidden_states[-1], dim=1) hidden_states2 = torch.mean(hidden_states[-2], dim=1) hidden_states3 = torch.mean(hidden_states[-3], dim=1) hidden_states4 = torch.mean(hidden_states[-4], dim=1) output = torch.cat( (pooler_output, hidden_states1, hidden_states2, hidden_states3, hidden_states4), dim=1) if self.isDropout: output = self.dropout(output) logit = self.classifier(output) return logit class BertDynCls(nn.Module): def __init__(self, config): super(BertDynCls, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = hidden_states[-(i + 1)][0] weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertDynEmbeddings(nn.Module): def __init__(self, config): super(BertDynEmbeddings, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dynWeight = nn.Linear(self.bert_config.hidden_size, 1) self.dence = nn.Linear(self.bert_config.hidden_size, 512) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(512, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) batch_size = pooler_output.shape[0] hid_avg_list = None weight_list = None for i, hidden in enumerate(hidden_states): hid_avg = torch.mean(hidden_states[-(i + 1)], dim=1) weight = self.dynWeight(hid_avg).repeat(1, self.bert_config.hidden_size) if hid_avg_list is None: hid_avg_list = hid_avg else: hid_avg_list = torch.cat((hid_avg_list, hid_avg), dim=1) if weight_list is None: weight_list = hid_avg else: weight_list = torch.cat((weight_list, weight), dim=1) concat_out = weight_list.mul_(hid_avg_list) concat_out = concat_out.reshape(batch_size, -1, self.bert_config.hidden_size) concat_out = torch.sum(concat_out, dim=1) if self.isDropout: concat_out = self.dropout(concat_out) concat_out = self.dence(concat_out) logit = self.classifier(concat_out) return logit class BertRNN(nn.Module): def __init__(self, config): super(BertRNN, self).__init__() self.rnn_type = "gru" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.1 self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.num_directions = 1 if not self.bidirectional else 2 if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) self.dropout = nn.Dropout(self.drop_out) self.fc_rnn = nn.Linear(self.hidden_dim * self.num_directions, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(sequence_output) else: output, (hidden, cell) = self.rnn(sequence_output) # output = [ batch size, sent len, hidden_dim * bidirectional] batch_size, max_seq_len, hidden_dim = output.shape hidden = torch.transpose(hidden, 1, 0) hidden = torch.mean(torch.reshape(hidden, [batch_size, -1, hidden_dim]), dim=1) output = torch.sum(output, dim=1) fc_input = self.dropout(output + hidden) # output = torch.mean(output, dim=1) # fc_input = self.dropout(output) out = self.fc_rnn(fc_input) return out class BertCNN(nn.Module): def __init__(self, config): super(BertCNN, self).__init__() self.num_filters = 100 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) self.hidden_size = self.bert_config.to_dict()['hidden_size'] self.filter_sizes = {3, 4, 5} self.drop_out = 0.5 self.convs = nn.ModuleList( [nn.Conv2d(1, self.num_filters, (k, self.hidden_size)) for k in self.filter_sizes]) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.dropout = nn.Dropout(self.drop_out) self.fc_cnn = nn.Linear(self.num_filters * len(self.filter_sizes), config.num_class) def conv_and_pool(self, x, conv): x = F.relu(conv(x)).squeeze(3) x = F.max_pool1d(x, x.size(2)).squeeze(2) return x def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = self.dropout(sequence_output) out = sequence_output.unsqueeze(1) out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) out = self.dropout(out) out = self.fc_cnn(out) return out class BertRCNN(nn.Module): def __init__(self, config): super(BertRCNN, self).__init__() self.rnn_type = "lstm" self.bidirectional = True self.hidden_dim = 256 self.n_layers = 2 self.batch_first = True self.drop_out = 0.5 config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json, output_hidden_states=True) if self.rnn_type == 'lstm': self.rnn = nn.LSTM(self.bert_config.to_dict()['hidden_size'], self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) elif self.rnn_type == 'gru': self.rnn = nn.GRU(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) else: self.rnn = nn.RNN(self.bert_config.to_dict()['hidden_size'], hidden_size=self.hidden_dim, num_layers=self.n_layers, bidirectional=self.bidirectional, batch_first=self.batch_first, dropout=self.drop_out) # self.maxpool = nn.MaxPool1d() self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) self.fc = nn.Linear(self.hidden_dim * self.n_layers, config.num_class) self.dropout = nn.Dropout(self.drop_out) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sentence_len = sequence_output.shape[1] pooler_output = pooler_output.unsqueeze(dim=1).repeat(1, sentence_len, 1) bert_sentence = sequence_output + pooler_output self.rnn.flatten_parameters() if self.rnn_type in ['rnn', 'gru']: output, hidden = self.rnn(bert_sentence) else: output, (hidden, cell) = self.rnn(bert_sentence) batch_size, max_seq_len, hidden_dim = output.shape out = torch.transpose(output.relu(), 1, 2) out = F.max_pool1d(out, max_seq_len).squeeze() out = self.fc(out) return out class XLNet(nn.Module): def __init__(self, config): super(XLNet, self).__init__() self.xlnet = XLNetModel.from_pretrained(config.model_path) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.fc = nn.Linear(self.xlnet.d_model, config.num_class) def forward(self, input_ids, input_masks, segment_ids): sequence_output = self.xlnet(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = torch.sum(sequence_output[0], dim=1) if self.isDropout: sequence_output = self.dropout(sequence_output) out = self.fc(sequence_output) return out class ElectraClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = get_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here x = self.dropout(x) x = self.out_proj(x) return x class Electra(nn.Module): def __init__(self, config): super(Electra, self).__init__() self.electra = ElectraModel.from_pretrained(config.model_path) config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.electra_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) self.electra_config.num_labels = config.num_class self.fc = ElectraClassificationHead(self.electra_config) def forward(self, input_ids, input_masks, segment_ids): discriminator_hidden_states = self.electra(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) sequence_output = discriminator_hidden_states[0] out = self.fc(sequence_output) return out class NEZHA(nn.Module): def __init__(self, config): super(NEZHA, self).__init__() self.n_classes = config.num_class config_json = 'bert_config.json' if os.path.exists(config.model_path + 'bert_config.json') else 'config.json' self.bert_config = CONFIGS[config.model].from_pretrained(config.model_path + config_json) #self.bert_model = MODELS[config.model](config=self.bert_config) self.bert_model = MODELS[config.model].from_pretrained(config.model_path, config=self.bert_config) # NEZHA init #torch_init_model(self.bert_model, os.path.join(config.model_path, 'pytorch_model.bin')) self.isDropout = True if 0 < config.dropout < 1 else False self.dropout = nn.Dropout(p=config.dropout) self.classifier = nn.Linear(self.bert_config.hidden_size * 2, self.n_classes) def forward(self, input_ids, input_masks, segment_ids): sequence_output, pooler_output = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks) seq_avg = torch.mean(sequence_output, dim=1) concat_out = torch.cat((seq_avg, pooler_output), dim=1) if self.isDropout: concat_out = self.dropout(concat_out) logit = self.classifier(concat_out) return logit ================================================ FILE: code/bert-base-count5-len32/finetuning/models/gitkeep ================================================ ================================================ FILE: code/bert-base-count5-len32/finetuning/multi_gpu_QA.py ================================================ from tqdm import tqdm, trange import numpy as np import pandas as pd import logging import torch import random import os from torch import nn, optim from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig from transformers.optimization import get_linear_schedule_with_warmup from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score from model import * from utils import * import time import logging logging.basicConfig(level=logging.DEBUG, filename="train.log",filemode='a') from NEZHA.modeling_nezha import * MODEL_CLASSES = { 'BertForClass': BertForClass, 'BertLastCls': BertLastCls, 'BertLastTwoCls': BertLastTwoCls, 'BertLastTwoClsPooler': BertLastTwoClsPooler, 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 'BertLastFourCls': BertLastFourCls, 'BertLastFourClsPooler': BertLastFourClsPooler, 'BertLastFourEmbeddings': BertLastFourEmbeddings, 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 'BertDynCls': BertDynCls, 'BertDynEmbeddings': BertDynEmbeddings, 'BertRNN': BertRNN, 'BertCNN': BertCNN, 'BertRCNN': BertRCNN, 'XLNet': XLNet, 'Electra': Electra, 'NEZHA': NEZHA, } class Config: def __init__(self): # 预训练模型路径 self.modelId = 2 self.model = "BertLastFourCls" self.Stratification = False self.model_path = '../../bert-base-count5/pretrain/bert_model/' self.num_class = 2 self.dropout = 0.2 self.MAX_LEN = 32 self.epoch = 3 self.learn_rate = 4e-5 self.normal_lr = 1e-4 self.batch_size = 32 self.k_fold = 10 self.seed = 42 self.device = torch.device('cuda') # self.device = torch.device('cpu') self.focalloss = False self.pgd = False self.fgm = True config = Config() os.environ['PYTHONHASHSEED']='0'#消除hash算法的随机性 random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) file_path = './log/' # 创建一个logger logger = logging.getLogger('mylogger') logger.setLevel(logging.DEBUG) train = pd.read_csv('/tcdata/gaiic_track3_round1_train_20210228.tsv',sep='\t',header=None) semi = pd.read_csv('/tcdata/gaiic_track3_round2_train_20210407.tsv',sep='\t',header=None) train = pd.concat([train, semi], sort=False) train.columns=['q1','q2','label'] train_query1 = train['q1'].values.astype(str) train_query2 = train['q2'].values.astype(str) train_label = train['label'].values.astype(int) oof_train = np.zeros((len(train), config.num_class), dtype=np.float32) #kf = StratifiedKFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) kf = KFold(n_splits=config.k_fold, shuffle=True, random_state=config.seed) for fold, (train_index, valid_index) in enumerate(kf.split(train_query1, train_label)): print('\n\n------------fold:{}------------\n'.format(fold)) ''' q1 = train_query1[train_index] q2 = train_query2[train_index] y = train_label[train_index] ''' q1 = train_query1 q2 = train_query2 y = train_label val_q1 = train_query1[valid_index] val_q2 = train_query2[valid_index] val_y = train_label[valid_index] train_D = data_generator([q1, q2, y], config, shuffle=True) val_D = data_generator([val_q1, val_q2, val_y], config) model = MODEL_CLASSES[config.model](config).to(config.device) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = torch.nn.DataParallel(model) if config.pgd: pgd = PGD(model) K = 3 elif config.fgm: fgm = FGM(model) if config.focalloss: loss_fn = FocalLoss(config.num_class) else: loss_fn = nn.CrossEntropyLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 num_train_steps = int(len(train) / config.batch_size * config.epoch) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if config.Stratification: bert_params = [x for x in param_optimizer if 'bert' in x[0]] normal_params = [p for n, p in param_optimizer if 'bert' not in n] optimizer_parameters = [ {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, {'params': normal_params, 'lr': config.normal_lr}, ] else: optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) # lr为全局学习率 scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(len(train) / config.batch_size / 2), num_training_steps=num_train_steps ) best_auc = 0 PATH = './models/bert_{}.pth'.format(fold) save_model_path = './models/' if not os.path.exists(save_model_path): os.makedirs(save_model_path) for e in range(config.epoch): print('\n------------epoch:{}------------'.format(e)) model.train() acc = 0 train_len = 0 loss_num = 0 tq = tqdm(train_D,ncols=70,disable=True) last=time.time() for input_ids, input_masks, segment_ids, labels in tq: label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) loss = loss_fn(y_pred, label_t) loss = loss.mean() loss.backward() if config.pgd: pgd.backup_grad() # 对抗训练 for t in range(K): pgd.attack(is_first_attack=(t == 0)) # 在embedding上添加对抗扰动, first attack时备份param.data if t != K - 1: model.zero_grad() else: pgd.restore_grad() y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 pgd.restore() # 恢复embedding参数 elif config.fgm: # 对抗训练 fgm.attack() # 在embedding上添加对抗扰动 y_pred = model(input_ids, input_masks, segment_ids) loss_adv = loss_fn(y_pred, label_t) loss_adv = loss_adv.mean() loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 fgm.restore() # 恢复embedding参数 # 梯度下降,更新参数 optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) acc += sum(y_pred == labels) loss_num += loss.item() train_len += len(labels) tq.set_postfix(fold=fold, epoch=e, loss=loss_num / train_len, acc=acc / train_len) print(f"微调第{e}轮耗时:{time.time()-last}") model.eval() with torch.no_grad(): y_p = [] y_l = [] train_logit = None for input_ids, input_masks, segment_ids, labels in tqdm(val_D,disable=True): label_t = torch.tensor(labels, dtype=torch.long).to(config.device) y_pred = model(input_ids, input_masks, segment_ids) y_pred = F.softmax(y_pred) y_pred = y_pred.detach().to("cpu").numpy() if train_logit is None: train_logit = y_pred else: train_logit = np.vstack((train_logit, y_pred)) y_p += list(y_pred[:,1]) y_pred = np.argmax(y_pred, axis=1) y_l += list(y_pred) f1 = f1_score(val_y, y_l, average="macro") auc_score = roc_auc_score(val_y, y_p) print("best_auc:{} auc_score:{} f1:{}\n".format(best_auc, auc_score, f1)) if auc_score >= best_auc: best_auc = auc_score oof_train[valid_index] = np.array(train_logit) #torch.save(model.module.state_dict() if hasattr(model, "module") else model.state_dict(), PATH) torch.save(model.module if hasattr(model, "module") else model, PATH) optimizer.zero_grad() del model torch.cuda.empty_cache() break ================================================ FILE: code/bert-base-count5-len32/finetuning/utils.py ================================================ import torch from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig import numpy as np import os import random from Config import * import torch import torch.nn as nn import torch.nn.functional as F def paddingList(ls:list,val,returnTensor=False): ls=ls[:]#不要改变了原list尺寸 maxLen=max([len(i) for i in ls]) for i in range(len(ls)): ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) return torch.tensor(ls,device='cuda') if returnTensor else ls def fastTokenizer(a:str,b:str,maxLen,tk): a,b=a.split(),b.split() a,b=tk.convert_tokens_to_ids(a),tk.convert_tokens_to_ids(b) maxLen-=3#空留给cls sep sep assert maxLen>=0 len2=maxLen//2#若为奇数,更长部分给左边 len1=maxLen-len2 #一共就a超长与否,b超长与否,组合的四种情况 if len(a)+len(b)>maxLen:#需要截断 if len(a)<=len1 and len(b)>len2: b=b[:maxLen-len(a)] elif len(a)>len1 and len(b)<=len2: a=a[:maxLen-len(b)] elif len(a)>len1 and len(b)>len2: a=a[:len1] b=b[:len2] input_ids=[tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] token_type_ids=[0]*(len(a)+2)+[1]*(len(b)+1) return {'input_ids': input_ids, 'token_type_ids': token_type_ids} class data_generator: def __init__(self, data, config, shuffle=False): self.data = data self.batch_size = config.batch_size self.max_length = config.MAX_LEN self.shuffle = shuffle vocab = 'vocab.txt' if os.path.exists(config.model_path + 'vocab.txt') else 'spiece.model' self.tokenizer = TOKENIZERS[config.model].from_pretrained(config.model_path + vocab) self.steps = len(self.data[0]) // self.batch_size if len(self.data[0]) % self.batch_size != 0: self.steps += 1 def __len__(self): return self.steps def __iter__(self): q1, q2, y = self.data idxs = list(range(len(self.data[0]))) if self.shuffle: np.random.shuffle(idxs) input_ids, input_masks, segment_ids, labels = [], [], [], [] for index, i in enumerate(idxs): text = q1[i] text_pair = q2[i] ''' # text = self.tokenizer(text, text_pair, padding='max_length', truncation=True, max_length=self.max_length) text = fastTokenizer(text, text_pair, self.max_length, self.tokenizer) input_ids.append(text['input_ids']) segment_ids.append(text['token_type_ids']) input_masks.append([1] * len(text['input_ids'])) # bs为1时无padding,全1 yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] ''' tkRes = self.tokenizer(text, text_pair, max_length=self.max_length, truncation='longest_first', return_attention_mask=False) input_id = tkRes['input_ids'] segment_id = tkRes['token_type_ids'] assert len(segment_id) == len(input_id) input_ids.append(input_id) segment_ids.append(segment_id) labels.append(y[i]) if len(input_ids) == self.batch_size or i == idxs[-1]: input_ids = paddingList(input_ids, 0, returnTensor=True) # 动态padding segment_ids = paddingList(segment_ids, 0, returnTensor=True) input_masks = (input_ids != 0) yield input_ids, input_masks, segment_ids, labels input_ids, input_masks, segment_ids, labels = [], [], [], [] class PGD(): def __init__(self, model): self.model = model self.emb_backup = {} self.grad_backup = {} def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: if is_first_attack: self.emb_backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0 and not torch.isnan(norm): r_at = alpha * param.grad / norm param.data.add_(r_at) param.data = self.project(name, param.data, epsilon) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.emb_backup param.data = self.emb_backup[name] self.emb_backup = {} def project(self, param_name, param_data, epsilon): r = param_data - self.emb_backup[param_name] if torch.norm(r) > epsilon: r = epsilon * r / torch.norm(r) return self.emb_backup[param_name] + r def backup_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: self.grad_backup[name] = param.grad.clone() def restore_grad(self): for name, param in self.model.named_parameters(): if param.requires_grad: param.grad = self.grad_backup[name] class FGM(): def __init__(self, model): self.model = model self.backup = {} def attack(self, epsilon=0.25, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: self.backup[name] = param.data.clone() norm = torch.norm(param.grad) if norm != 0: r_at = epsilon * param.grad / norm param.data.add_(r_at) def restore(self, emb_name='word_embeddings'): # emb_name这个参数要换成你模型中embedding的参数名 for name, param in self.model.named_parameters(): if param.requires_grad and emb_name in name: assert name in self.backup param.data = self.backup[name] self.backup = {} # 支持多分类和二分类 class FocalLoss(nn.Module): """ This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) :param num_class: :param alpha: (tensor) 3D or 4D the scalar factor for this criterion :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more focus on hard misclassified example :param smooth: (float,double) smooth value when cross entropy :param balance_index: (int) balance class index, should be specific when alpha is float :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. """ def __init__(self, num_class, alpha=None, gamma=2, smooth=None, size_average=True): super(FocalLoss, self).__init__() self.num_class = num_class self.alpha = alpha self.gamma = gamma self.smooth = smooth self.size_average = size_average if self.alpha is None: self.alpha = torch.ones(self.num_class, 1) elif isinstance(self.alpha, (list, np.ndarray)): assert len(self.alpha) == self.num_class self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) self.alpha = self.alpha / self.alpha.sum() else: raise TypeError('Not support alpha type') if self.smooth is not None: if self.smooth < 0 or self.smooth > 1.0: raise ValueError('smooth value should be in [0,1]') def forward(self, input, target): logit = F.softmax(input, dim=1) if logit.dim() > 2: # N,C,d1,d2 -> N,C,m (m=d1*d2*...) logit = logit.view(logit.size(0), logit.size(1), -1) logit = logit.permute(0, 2, 1).contiguous() logit = logit.view(-1, logit.size(-1)) target = target.view(-1, 1) # N = input.size(0) # alpha = torch.ones(N, self.num_class) # alpha = alpha * (1 - self.alpha) # alpha = alpha.scatter_(1, target.long(), self.alpha) epsilon = 1e-10 alpha = self.alpha if alpha.device != input.device: alpha = alpha.to(input.device) idx = target.cpu().long() one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() one_hot_key = one_hot_key.scatter_(1, idx, 1) if one_hot_key.device != logit.device: one_hot_key = one_hot_key.to(logit.device) if self.smooth: one_hot_key = torch.clamp( one_hot_key, self.smooth, 1.0 - self.smooth) pt = (one_hot_key * logit).sum(1) + epsilon logpt = pt.log() gamma = self.gamma alpha = alpha[idx] loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt if self.size_average: loss = loss.mean() else: loss = loss.sum() return loss def f1_match(y_true,y_pred): acc = sum(y_pred & y_true) / (sum(y_pred)) rec = sum(y_pred & y_true) / (sum(y_true)) return 2 * acc * rec /(acc + rec) ================================================ FILE: code/build_vocab.py ================================================ from collections import Counter def loadData(path): allData=[] with open(path,"r") as f: for i in f: i=i.strip().split('\t') if len(i)==0:#防止空行 break if len(i)==3:#训练集 a,b,label=i else:#测试集,直接转为id形式 a,b,label=i[0],i[1],-1 a,b=[int(i) for i in a.split()],[int(i) for i in b.split()] allData.append([a,b]) return allData allData=loadData('/tcdata/gaiic_track3_round1_train_20210228.tsv')+loadData('/tcdata/gaiic_track3_round2_train_20210407.tsv') test_data = loadData('/tcdata/gaiic_track3_round1_testA_20210228.tsv')+loadData('/tcdata/gaiic_track3_round1_testB_20210317.tsv') model_lists = ["nezha-base-count3", "nezha-base-count5", "bert-base-count3", "bert-base-count3-len100", "bert-base-count5", "bert-base-count5-len32"] childPath_lists=[ ['/pretrain/nezha_model/','/finetuning/models/'], ['/pretrain/nezha_model/','/finetuning/models/'], ['/pretrain/bert_model/','/finetuning/models/'], ['/finetuning/models/'], ['/pretrain/bert_model/','/finetuning/models/'], ['/finetuning/models/'], ] counts=[3,5,3,3,5,5] token2count=Counter() for i,j in allData+test_data: token2count.update(i+j) for modelPath,childPath,ct in zip(model_lists,childPath_lists,counts): pre=['[PAD]','[UNK]','[CLS]','[SEP]','[MASK]',] tail=[] for k,v in token2count.items(): if v>=ct: tail.append(k) tail.sort() vocab=pre+tail print(f"模型{modelPath},词频:{ct},词表大小:{len(vocab)}") for ch in childPath: with open(modelPath+ch+'vocab.txt', "w", encoding="utf-8") as f: for i in vocab: f.write(str(i)+'\n') ================================================ FILE: code/docker_build.sh ================================================ #!/bin/bash if [ -z $1 ] then echo "s1:version" exit fi if [ -z $DOCKER_REGISTRY ] then echo "not find $DOCKER_REGISTRY" exit fi VERSION=$1 ##1.create docker regsit docker build -t $DOCKER_REGISTRY/tianchi-submit:$VERSION . ##2.PUSH docker push $DOCKER_REGISTRY/tianchi-submit:$VERSION ##3.echo submit info echo "now you can go to tianchi.aliyun.com sunmit this docker url: $DOCKER_REGISTRY/tianchi-submit:$VERSION ================================================ FILE: code/main_fusion_thread.py ================================================ import logging import traceback from flask import Flask, request from utils import * from queue import Queue import threading opset_version = 11 # 此处示例,需要根据模型类型重写 def init_model(model_path, export_model_path, optimized_model_path, length=32): model = torch.load(model_path).to(torch.device("cuda")) model.eval() if length == 32: data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 20, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]] else: data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458]], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]] inputs = { 'input_ids': torch.tensor(data[0]).to(config.device), 'input_masks': torch.tensor(data[1]).to(config.device), 'segment_ids': torch.tensor(data[2]).to(config.device) } if True or not os.path.exists(export_model_path): with torch.no_grad(): symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} torch.onnx.export(model, # model being run args=tuple(inputs.values()), # model input (or a tuple for multiple inputs) f=export_model_path, # where to save the model (can be a file or file-like object) opset_version=opset_version, # the ONNX version to export the model to do_constant_folding=True, # whether to execute constant folding for optimization input_names=['input_ids', # the model's input names 'input_masks', 'segment_ids'], output_names=['predict'], # the model's output names dynamic_axes={'input_ids': symbolic_names, # variable length axes 'input_masks': symbolic_names, 'segment_ids': symbolic_names, 'predict': symbolic_names}) print("Model exported at ", export_model_path) from onnxruntime_tools import optimizer from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions opt_options = BertOptimizationOptions('bert') opt_options.enable_embed_layer_norm = False opt_model = optimizer.optimize_model( export_model_path, 'bert', num_heads=12, hidden_size=768, optimization_options=opt_options) opt_model.save_model_to_file(optimized_model_path) del model torch.cuda.empty_cache() import psutil import onnxruntime assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers() sess_options = onnxruntime.SessionOptions() sess_options.intra_op_num_threads = psutil.cpu_count(logical=True) session = onnxruntime.InferenceSession(optimized_model_path, sess_options) ort_inputs = { 'input_ids': [[0]*32], 'input_masks': [[0]*32], 'segment_ids': [[0]*32] } session.run(None, ort_inputs)#预先启动一下 return session def infer(session,config,inp:Queue,res:Queue): data_gen = data_generator(config) while True: query_A, query_B=inp.get()#不断从自己队列中取 input_ids, input_masks, segment_ids = data_gen.generate((query_A, query_B)) ort_inputs = { 'input_ids': input_ids, 'input_masks': input_masks, 'segment_ids': segment_ids } y_pred = session.run(None, ort_inputs) res.put(y_pred[0])#结果放入队列 def softmax(x, axis=1): # 计算每行的最大值 row_max = x.max(axis=axis) # 每行元素都需要减去对应的最大值,否则求exp(x)会溢出,导致inf情况 row_max = row_max.reshape(-1, 1) x = x - row_max # 计算e的指数次幂 x_exp = np.exp(x) x_sum = np.sum(x_exp, axis=axis, keepdims=True) s = x_exp / x_sum return s class Config: def __init__(self): # 预训练模型路径 self.modelId = 2 self.model = "NEZHA" self.Stratification = False self.model_path = 'model0/' self.num_class = 2 self.dropout = 0.2 self.MAX_LEN = 32 self.epoch = 5 self.learn_rate = 2e-5 self.normal_lr = 1e-4 self.batch_size = 1 self.k_fold = 5 self.seed = 42 self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.focalloss = False self.pgd = False self.fgm = True # 允许使用类似Flask的别的服务方式 app = Flask(__name__) @app.route("/tccapi", methods=['GET', 'POST']) def tccapi(): data = request.get_data() if (data == b"exit"): print("received exit command, exit now") os._exit(0) input_list = request.form.getlist("input") index_list = request.form.getlist("index") response_batch = {} response_batch["results"] = [] for i in range(len(index_list)): index_str = index_list[i] response = {} try: input_sample = input_list[i].strip() elems = input_sample.strip().split("\t") query_A = elems[0].strip() query_B = elems[1].strip() for i in runningModelIds[:3]:#先只用上前三个模型 assert inps[i].qsize()==0 inps[i].put((query_A,query_B))#为子线程提供数据 predict_res=[] for i in runningModelIds[:3]: predict_res.append(res.get())#取3次 assert res.qsize()==0 y_pred = np.mean(predict_res, axis=0) y_pred = softmax(np.array(y_pred)) if 0.15